Actions

icon Post
text/html Subscribe
text/html Unsubscribe

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[vsipl++] [patch] Merge 1.4 fixes into trunk


  • To: VSIPL++ Developers List <vsipl++@xxxxxxxxxxxxxxxx>
  • Subject: [vsipl++] [patch] Merge 1.4 fixes into trunk
  • From: Jules Bergmann <jules@xxxxxxxxxxxxxxxx>
  • Date: Tue, 18 Mar 2008 11:46:48 -0400

This patch merges the recent VSIPL++ 1.4 fixes into trunk.

Patch applied.

			-- Jules
--
Jules Bergmann
CodeSourcery
jules@xxxxxxxxxxxxxxxx
(650) 331-3385 x705
Index: ChangeLog
===================================================================
--- ChangeLog	(revision 192554)
+++ ChangeLog	(working copy)
@@ -1,3 +1,49 @@
+2008-03-18  Jules Bergmann  <jules@xxxxxxxxxxxxxxxx>
+
+	Merge 1.4 MCOE updates.
+	* configure.ac (CXXDEP): Update for ccmc++.
+	* m4/fft.m4 (--with-fftw3-cfg-opts): New option, passes options
+	  directly to FFTW3 configure.
+	* src/vsip/core/fns_scalar.hpp: Handle missing hypot decl.
+	* src/vsip/opt/sal/conv.hpp: Loosen threshold on SAL td convolution.
+	* vendor/GNUmakefile.inc.in: Use LIBEXT for FFTW3.
+	* tests/matvec.cpp: Fill in macros missing from MCOE GCC's cmath.
+	* examples/mercury/mcoe-setup.sh: Update.
+	
+	Merge 1.4 SIMD unaligned loop fusion changes.
+	* src/vsip/opt/simd/expr_evaluator.hpp: Move Proxy_factory ...
+	* src/vsip/opt/simd/proxy_factory.hpp: New file, ... to here.
+	* src/vsip/opt/simd/eval_unaligned.hpp: New file, unaligned SIMD
+	  loop-fusion evaluator from expr_evaluator.hpp.
+	* src/vsip/opt/simd/expr_iterator.hpp (Simd_unaligned_loader): Move
+	  loads around to avoid second load past end of vector
+	  (first load inevitable).
+	* src/vsip/opt/expr/serial_dispatch.hpp: Include eval_unaligned.hpp.
+	* configure.ac (--enable-simd-unaligned-loop-fusion): Allow SIMD
+	  unaligned loop fusion to be controlled independently of aligned
+	  loop fusion.
+	* doc/quickstart/quickstart.xml: Document --enable-simd-loop-fusion
+	  and --enable-simd-unaligned-loop-fusion.
+
+	Merge 1.4 bugfix for builtin SIMD routine unaligned handling.
+	* src/vsip/opt/simd/rscvmul.hpp: Fix bug in handling unalignment.
+	* src/vsip/opt/simd/threshold.hpp: Likewise.
+	* src/vsip/opt/simd/vgt.hpp: Likewise.
+	* src/vsip/opt/simd/vma_ip_csc.hpp: Likewise.
+	* src/vsip/opt/simd/vaxpy.hpp: Likewise.
+	* src/vsip/opt/simd/vadd.hpp: Likewise.
+	* src/vsip/opt/simd/vlogic.hpp: Likewise.
+	* src/vsip/opt/simd/vmul.hpp: Likewise.
+	* tests/regressions/view_offset.cpp: New test, regression coverage
+	  for unalignment handling bug.
+
+	Merge 1.4 pwarp and misc changes.
+	* src/vsip_csl/img/impl/pwarp_simd.hpp: Clear u/v if out of bounds,
+	  add error checking.
+	* tests/vsip_csl/pwarp.cpp: Merge from afrl branch.
+	* tests/regressions/transpose_assign.cpp: Add runtime verbosity.
+	* tests/threshold.cpp: Add debug output.
+
 2008-02-04  Jules Bergmann  <jules@xxxxxxxxxxxxxxxx>
 
 	* configure.ac (EMBED_SPU): Include -m32/-m64 option.
Index: m4/fft.m4
===================================================================
--- m4/fft.m4	(revision 191870)
+++ m4/fft.m4	(working copy)
@@ -41,6 +41,11 @@
                  [Specify CFLAGS to use when building built-in FFTW3.
 		  Only used if --with-fft=builtin.]))
 
+AC_ARG_WITH(fftw3_cfg_opts,
+  AS_HELP_STRING([--with-fftw3-cfg-opts=OPTS],
+                 [Specify additional options to use when configuring built-in
+                  FFTW3. Only used if --with-fft=builtin.]))
+
 AC_ARG_ENABLE(fftw3_simd,
   AS_HELP_STRING([--disable-fftw3-simd],
                  [Disable use of SIMD instructions by FFTW3.  Useful
@@ -265,7 +270,7 @@
       mkdir -p vendor/fftw3f
       AC_MSG_NOTICE([Configuring fftw3f (float).])
       AC_MSG_NOTICE([extra config options: '$fftw3_f_simd'.])
-      (cd vendor/fftw3f; $fftw3_configure CC="$fftw_CC" $fftw3_f_simd $fftw3_opts --enable-float)
+      (cd vendor/fftw3f; $fftw3_configure CC="$fftw_CC" $fftw3_f_simd $fftw3_opts $with_fftw3_cfg_opts --enable-float)
       libs="$libs -lfftw3f"
     fi
     if test "$enable_fft_double" = yes; then
@@ -273,7 +278,7 @@
       mkdir -p vendor/fftw3
       AC_MSG_NOTICE([Configuring fftw3 (double).])
       AC_MSG_NOTICE([extra config options: '$fftw3_d_simd'.])
-      (cd vendor/fftw3; $fftw3_configure CC="$fftw_CC" $fftw3_d_simd $fftw3_opts )
+      (cd vendor/fftw3; $fftw3_configure CC="$fftw_CC" $fftw3_d_simd $fftw3_opts $with_fftw3_cfg_opts )
       libs="$libs -lfftw3"
     fi
     if test "$enable_fft_long_double" = yes; then
@@ -282,7 +287,7 @@
       mkdir -p vendor/fftw3l
       AC_MSG_NOTICE([Configuring fftw3l (long double).])
       AC_MSG_NOTICE([extra config options: '$fftw3_l_simd'.])
-      (cd vendor/fftw3l; $fftw3_configure CC="$fftw_CC" $fftw3_l_simd $fftw3_opts --enable-long-double)
+      (cd vendor/fftw3l; $fftw3_configure CC="$fftw_CC" $fftw3_l_simd $fftw3_opts $with_fftw3_cfg_opts --enable-long-double)
       libs="$libs -lfftw3l"
     fi
 
Index: configure.ac
===================================================================
--- configure.ac	(revision 192554)
+++ configure.ac	(working copy)
@@ -303,9 +303,15 @@
 
 AC_ARG_ENABLE([simd_loop_fusion],
   AS_HELP_STRING([--enable-simd-loop-fusion],
-                 [Enable SIMD loop-fusion.]),,
+                 [Enable SIMD loop-fusion (Disable by default).]),,
   [enable_simd_loop_fusion=no])
 
+AC_ARG_ENABLE([simd_unaligned_loop_fusion],
+  AS_HELP_STRING([--enable-simd-unaligned-loop-fusion],
+                 [Enable SIMD loop-fusion for unaligned expressions
+                  (Follows --enable-simd-loop-fusion by default).]),,
+  [enable_simd_unaligned_loop_fusion=default])
+
 AC_ARG_WITH([builtin_simd_routines],
   AS_HELP_STRING([--with-builtin-simd-routines=WHAT],
                  [Use builtin SIMD routines.]),,
@@ -375,6 +381,12 @@
   BOOST_VERSION="1.33"
 )
 
+AC_ARG_ENABLE([huge_page_pool],
+  AS_HELP_STRING([--disable-huge-page-pool],
+                 [Disable support for huge page memory allocator pool
+                  (enabled by default)]),,
+  [enable_huge_page_pool=yes])
+
 #
 # Files to generate.
 #
@@ -455,6 +467,8 @@
   CXXDEP="$CXX /QM"
   INTEL_WIN=1
   cygwin_mount=`cygpath -w /`
+elif test "$CXX" == "ccmc++"; then
+  CXXDEP="$CXX -M"
 else
   CXXDEP="$CXX -M -x c++"
   cygwin_mount=
@@ -864,13 +878,22 @@
 #
 # Configure use of SIMD loop-fusion
 #
+if test "$enable_simd_unaligned_loop_fusion" = "default"; then
+  enable_simd_unaligned_loop_fusion=$enable_simd_loop_fusion
+fi
+
 if test "$enable_simd_loop_fusion" = "yes"; then
   AC_DEFINE_UNQUOTED(VSIP_IMPL_HAVE_SIMD_LOOP_FUSION, 1,
     [Define whether to use SIMD loop-fusion in expr dispatch.])
 fi
 
+if test "$enable_simd_unaligned_loop_fusion" = "yes"; then
+  AC_DEFINE_UNQUOTED(VSIP_IMPL_HAVE_SIMD_UNALIGNED_LOOP_FUSION, 1,
+    [Define whether to use SIMD unaligned loop-fusion in expr dispatch.])
+fi
 
 
+
 #
 # Configure use of builtin SIMD routines
 #
@@ -943,6 +966,18 @@
           [Define to indicate this is CodeSourcery's VSIPL++.])
 
 #
+# Configure huge_page_pool support
+#
+AC_CHECK_HEADERS([sys/mman.h], [], [ enable_huge_page_pool="no"], [])
+if test "$enable_huge_page_pool" = "yes"; then
+  AC_DEFINE_UNQUOTED(VSIP_IMPL_ENABLE_HUGE_PAGE_POOL, 1,
+                     [Define to enable huge page pool support.])
+  AC_SUBST(VSIP_IMPL_HAVE_HUGE_PAGE_POOL, 1)
+else
+  AC_SUBST(VSIP_IMPL_HAVE_HUGE_PAGE_POOL, "")
+fi 
+
+#
 # library
 #
 ARFLAGS="r"
@@ -1063,6 +1098,8 @@
 else
   AC_MSG_RESULT([Complex storage format:                  interleaved])
 fi
+AC_MSG_RESULT([Using SIMD aligned loop-fusion           ${enable_simd_loop_fusion}])
+AC_MSG_RESULT([Using SIMD unaligned loop-fusion         ${enable_simd_unaligned_loop_fusion}])
 AC_MSG_RESULT([Timer:                                   ${enable_timer}])
 AC_MSG_RESULT([With Python bindings:                    ${enable_scripting}])
 
Index: src/vsip/core/fns_scalar.hpp
===================================================================
--- src/vsip/core/fns_scalar.hpp	(revision 191870)
+++ src/vsip/core/fns_scalar.hpp	(working copy)
@@ -23,10 +23,15 @@
 #include <complex>
 
 #if !HAVE_DECL_HYPOTF
-#if HAVE_HYPOTF
+# if HAVE_HYPOTF
 extern "C" float hypotf(float, float);
 # endif
 #endif
+#if !HAVE_DECL_HYPOT
+# if HAVE_HYPOT
+extern "C" double hypot(double, double);
+# endif
+#endif
 
 namespace vsip
 {
Index: src/vsip/opt/sal/conv.hpp
===================================================================
--- src/vsip/opt/sal/conv.hpp	(revision 191870)
+++ src/vsip/opt/sal/conv.hpp	(working copy)
@@ -144,8 +144,11 @@
   Definitions
 ***********************************************************************/
 
-// These help enforce limits on the length of the kernel
-// when using SAL, which differ for complex values
+// 080313: These kernel sizes represent cross-over points where
+//   frequency domain convolution may be more efficient.  Currently
+//   we ignore them because we don't use SAL's frequency domain
+//   convolution, and SAL's time-domain convolution is faster than
+//   a generic time-domain convolution.
 template <typename T>
 struct Max_kernel_length
 {
@@ -297,7 +300,8 @@
   stride_type s_coeff = coeff_.block().impl_stride(1, 0);
 
   assert( Max_kernel_length<T>::value != 0 );
-  if ( (M <= Max_kernel_length<T>::value) && (decimation_ == 1) ) 
+  // See note above on Max_kernel_length defn.
+  if ( /*(M <= Max_kernel_length<T>::value) &&*/ (decimation_ == 1) ) 
   {
     // SAL only does the minimum convolution
     if (Supp == support_full)
Index: vendor/GNUmakefile.inc.in
===================================================================
--- vendor/GNUmakefile.inc.in	(revision 192095)
+++ vendor/GNUmakefile.inc.in	(working copy)
@@ -203,7 +203,7 @@
 
 vpath %.h src:$(srcdir)
 
-lib/libfftw3f.a: vendor/fftw3f/.libs/libfftw3f.a
+lib/libfftw3f.$(LIBEXT): vendor/fftw3f/.libs/libfftw3f.a
 	cp $< $@
 
 vendor/fftw3f/.libs/libfftw3f.a:
@@ -214,7 +214,7 @@
 	done
 	@$(MAKE) -C vendor/fftw3f all-am >> fftw-f.build.log 2>&1
 
-lib/libfftw3.a: vendor/fftw3/.libs/libfftw3.a
+lib/libfftw3.$(LIBEXT): vendor/fftw3/.libs/libfftw3.a
 	cp $< $@
 
 vendor/fftw3/.libs/libfftw3.a:
@@ -225,7 +225,7 @@
 	done
 	@$(MAKE) -C vendor/fftw3 all-am >> fftw-d.build.log 2>&1
 
-lib/libfftw3l.a: vendor/fftw3l/.libs/libfftw3l.a
+lib/libfftw3l.$(LIBEXT): vendor/fftw3l/.libs/libfftw3l.a
 	cp $< $@
 
 vendor/fftw3l/.libs/libfftw3l.a:
@@ -238,13 +238,13 @@
 
 ifdef USE_BUILTIN_FFTW
   ifdef USE_BUILTIN_FFTW_FLOAT
-    vendor_FFTW_LIBS += lib/libfftw3f.a
+    vendor_FFTW_LIBS += lib/libfftw3f.$(LIBEXT)
   endif
   ifdef USE_BUILTIN_FFTW_DOUBLE
-    vendor_FFTW_LIBS += lib/libfftw3.a
+    vendor_FFTW_LIBS += lib/libfftw3.$(LIBEXT)
   endif
   ifdef USE_BUILTIN_FFTW_LONG_DOUBLE
-    vendor_FFTW_LIBS += lib/libfftw3l.a
+    vendor_FFTW_LIBS += lib/libfftw3l.$(LIBEXT)
   endif
 
 libs += $(vendor_FFTW_LIBS) 
@@ -255,7 +255,7 @@
 	@for ldir in $(subst .a,,$(subst lib/lib,,$(vendor_FFTW_LIBS))); do \
 	  $(MAKE) -C vendor/$$ldir clean >> fftw.clean.log 2>&1; \
 	  echo "$(MAKE) -C vendor/$$ldir clean "; done
-	rm -f lib/libfftw3.a lib/libfftw3f.a lib/libfftw3l.a
+	rm -f lib/libfftw3.$(LIBEXT) lib/libfftw3f.$(LIBEXT) lib/libfftw3l.$(LIBEXT)
 
 install:: $(vendor_FFTW_LIBS)
 	@echo "Installing FFTW"
Index: examples/mercury/mcoe-setup.sh
===================================================================
--- examples/mercury/mcoe-setup.sh	(revision 191870)
+++ examples/mercury/mcoe-setup.sh	(working copy)
@@ -138,18 +138,18 @@
   ex_off_flags="--no_exceptions"
   ex_on_flags="--exceptions"
 
-  fftw3_cflags="-Ospeed $toolset_flag"
+  fftw3_cflags="-Ospeed $pflags $toolset_flag"
 else
-  toolset_flag="-compiler GCC"
+  toolset_flag="-compiler GNU"
   cxxflags="$pflags $toolset_flag"
 
-  opt_flags="-Ospeed -OI -DNDEBUG"
+  opt_flags="-Otime -DNDEBUG -w"
   dbg_flags="-g"
 
   ex_off_flags="-fno-exceptions"
   ex_o_flags=""				# exceptions enabled by default.
 
-  fftw3_cflags="-Ospeed $toolset_flag"
+  fftw3_cflags="-Otime $pflags $toolset_flag"
 fi
 
 if test $opt = "y"; then
@@ -169,7 +169,7 @@
 fi
 
 if test $sal = "y"; then
-  cfg_flags="$cfg_flags --enable-sal"
+  cfg_flags="$cfg_flags --with-sal"
 fi
 
 if test $exceptions = "n"; then
@@ -215,13 +215,14 @@
 # run configure
 
 echo "$dir/configure"
-$dir/configure					\
-	--prefix=$prefix			\
-	--host=powerpc				\
-	--enable-fft=$fft			\
-	--with-fftw3-cflags="$fftw3_cflags"	\
-	--with-complex=$fmt			\
-	--with-lapack=no			\
-	$cfg_flags				\
-	--with-test-level=$testlevel		\
+$dir/configure						\
+	--prefix=$prefix				\
+	--host=powerpc					\
+	--enable-fft=$fft				\
+	--with-fftw3-cflags="$fftw3_cflags"		\
+	--with-fftw3-cfg-opts="--with-our-malloc16"	\
+	--with-complex=$fmt				\
+	--with-lapack=no				\
+	$cfg_flags					\
+	--with-test-level=$testlevel			\
 	--enable-timer=$timer
Index: src/vsip/opt/simd/expr_evaluator.hpp
===================================================================
--- src/vsip/opt/simd/expr_evaluator.hpp	(revision 191870)
+++ src/vsip/opt/simd/expr_evaluator.hpp	(working copy)
@@ -1,4 +1,4 @@
-/* Copyright (c) 2006, 2007 by CodeSourcery.  All rights reserved.
+/* Copyright (c) 2006, 2007, 2008 by CodeSourcery.  All rights reserved.
 
    This file is available for license from CodeSourcery, Inc. under the terms
    of a commercial license and under the GPL.  It is not part of the VSIPL++
@@ -31,6 +31,7 @@
 #include <vsip/core/metaprogramming.hpp>
 #include <vsip/core/extdata.hpp>
 #include <vsip/opt/expr/serial_evaluator.hpp>
+#include <vsip/opt/simd/proxy_factory.hpp>
 
 /***********************************************************************
   Definitions
@@ -40,213 +41,9 @@
 {
 namespace impl
 {
-namespace simd
-{
 
-template <typename BlockT, bool A>
-struct Proxy_factory
-{
-  typedef Direct_access_traits<typename BlockT::value_type> access_traits;
-  typedef Proxy<access_traits, A> proxy_type;
-  typedef typename Adjust_layout_dim<
-                     1, typename Block_layout<BlockT>::layout_type>::type
-		layout_type;
+// SIMD Loop Fusion evaluator for aligned expressions.
 
-  static bool const ct_valid = Ext_data_cost<BlockT>::value == 0 &&
-    !Is_split_block<BlockT>::value;
-
-  static bool 
-  rt_valid(BlockT const &b, int alignment)
-  {
-    Ext_data<BlockT, layout_type> dda(b, SYNC_IN);
-    return dda.stride(0) == 1 && 
-      (!A ||
-       Simd_traits<typename BlockT::value_type>::alignment_of(dda.data()) ==
-       alignment);
-  }
-
-  static int
-  alignment(BlockT const &b)
-  {
-    Ext_data<BlockT, layout_type> dda(b, SYNC_IN);
-    return Simd_traits<typename BlockT::value_type>::alignment_of(dda.data());
-  }
-
-  static proxy_type
-  create(BlockT const &b) 
-  {
-    Ext_data<BlockT, layout_type> dda(b, SYNC_IN);
-    return proxy_type(dda.data());
-  }
-};
-
-template <typename T, bool A>
-struct Proxy_factory<Scalar_block<1, T>, A>
-{
-  typedef Scalar_access_traits<T> access_traits;
-  typedef Proxy<access_traits, A> proxy_type;
-  static bool const ct_valid = true;
-
-  static bool 
-  rt_valid(Scalar_block<1, T> const &, int) {return true;}
-
-  static proxy_type
-  create(Scalar_block<1, T> const &b) 
-  {
-    return proxy_type(b.value());
-  }
-};
-
-template <dimension_type D,
-	  template <typename> class O,
-	  typename B,
-	  typename T,
-	  bool A>
-struct Proxy_factory<Unary_expr_block<D, O, B, T> const, A>
-{
-  typedef 
-    Unary_access_traits<typename Proxy_factory<B,A>::proxy_type, O>
-    access_traits;
-  typedef Proxy<access_traits,A> proxy_type;
-
-  static bool const ct_valid =
-    Unary_operator_map<T, O>::is_supported &&
-    Type_equal<typename B::value_type, T>::value &&
-    Proxy_factory<B, A>::ct_valid;
-
-  static bool 
-  rt_valid(Unary_expr_block<D, O, B, T> const &b, int alignment)
-  {
-    return Proxy_factory<B, A>::rt_valid(b.op(), alignment);
-  }
-
-  static proxy_type
-  create(Unary_expr_block<D, O, B, T> const &b)
-  {
-    return proxy_type(Proxy_factory<B, A>::create(b.op()));
-  }
-};
-
-// This proxy is specialized for unaligned blocks. If the user specifies
-// ualigned(block), this is a hint to switch to an unaligned proxy.
-template <dimension_type D,
-	  typename B,
-	  typename T,
-	  bool A>
-struct Proxy_factory<Unary_expr_block<D, unaligned_functor, B, T> const, A>
-{
-  typedef typename Proxy_factory<B, false>::access_traits access_traits;
-  typedef Proxy<access_traits,false> proxy_type;
-  static bool const ct_valid = Proxy_factory<B,false>::ct_valid;
-
-
-  static bool 
-  rt_valid(Unary_expr_block<D, unaligned_functor, B, T> const &b, int alignment)
-  {
-    return Proxy_factory<B, false>::rt_valid(b.op(), alignment);
-  }
-
-  static proxy_type
-  create(Unary_expr_block<D, unaligned_functor, B, T> const &b)
-  {
-    return proxy_type(Proxy_factory<B, false>::create(b.op()));
-  }
-};
-
-template <dimension_type                D,
-	  template <typename, typename> class O,
-	  typename                      LB,
-	  typename                      LT,
-	  typename                      RB,
-	  typename                      RT,
-	  bool A>
-struct Proxy_factory<Binary_expr_block<D, O, LB, LT, RB, RT> const, A>
-{
-  typedef
-    Binary_access_traits<typename Proxy_factory<LB, A>::proxy_type,
-			 typename Proxy_factory<RB, A>::proxy_type, O> 
-    access_traits;
-  typedef Proxy<access_traits, A> proxy_type;
-  static bool const ct_valid = 
-    Type_equal<typename LB::value_type, LT>::value &&
-    Type_equal<typename RB::value_type, RT>::value &&
-    Type_equal<LT, RT>::value &&
-    Binary_operator_map<LT, O>::is_supported &&
-    Proxy_factory<LB, A>::ct_valid &&
-    Proxy_factory<RB, A>::ct_valid;
-
-  static bool 
-  rt_valid(Binary_expr_block<D, O, LB, LT, RB, RT> const &b, int alignment)
-  {
-    return Proxy_factory<LB, A>::rt_valid(b.left(), alignment) &&
-           Proxy_factory<RB, A>::rt_valid(b.right(), alignment);
-  }
-
-  static proxy_type
-  create(Binary_expr_block<D, O, LB, LT, RB, RT> const &b)
-  {
-    typename Proxy_factory<LB, A>::proxy_type lp =
-      Proxy_factory<LB, A>::create(b.left());
-    typename Proxy_factory<RB, A>::proxy_type rp =
-      Proxy_factory<RB, A>::create(b.right());
-
-    return proxy_type(lp, rp);
-  }
-};
-
-template <dimension_type                         D,
-	  template <typename, typename,typename> class O,
-	  typename                               Block1, typename Type1,
-	  typename                               Block2, typename Type2,
-	  typename                               Block3, typename Type3,
-	  bool A>
-struct Proxy_factory<Ternary_expr_block<D, O,
-  Block1,Type1,Block2,Type2,Block3,Type3> const, A>
-{
-  typedef Ternary_access_traits<typename Proxy_factory<Block1, A>::proxy_type,
-                                typename Proxy_factory<Block2, A>::proxy_type,
-                                typename Proxy_factory<Block3, A>::proxy_type,
-		 	        O> 
-    access_traits;
-
-  typedef Ternary_expr_block<D, O, Block1,Type1,Block2,Type2,Block3,Type3>
-    SrcBlock;
-
-  typedef Proxy<access_traits, A> proxy_type;
-  static bool const ct_valid = 
-    Ternary_operator_map<Type1, O>::is_supported &&
-    Proxy_factory<Block1, A>::ct_valid &&
-    Proxy_factory<Block2, A>::ct_valid &&
-    Proxy_factory<Block3, A>::ct_valid;
-
-  static bool 
-  rt_valid(SrcBlock const &b, int alignment)
-  {
-    return Proxy_factory<Block1, A>::rt_valid(b.first(), alignment) &&
-           Proxy_factory<Block2, A>::rt_valid(b.second(), alignment) &&
-           Proxy_factory<Block3, A>::rt_valid(b.third(), alignment);
-  }
-
-  static proxy_type
-  create(SrcBlock const &b)
-  {
-    typename Proxy_factory<Block1, A>::proxy_type
-      b1p = Proxy_factory<Block1, A>::create(b.first());
-    typename Proxy_factory<Block2, A>::proxy_type
-      b2p = Proxy_factory<Block2, A>::create(b.second());
-    typename Proxy_factory<Block3, A>::proxy_type
-      b3p = Proxy_factory<Block3, A>::create(b.third());
-
-    return proxy_type(b1p,b2p,b3p);
-  }
-};
-
-
-} // namespace vsip::impl::simd
-
-
-// This evaluator is for aligned data only.
-// Look at Simd_unaligned_loop_fusion_tag for unaligned data.
 template <typename LB,
 	  typename RB>
 struct Serial_expr_evaluator<1, LB, RB, Simd_loop_fusion_tag>
@@ -326,78 +123,6 @@
   }
 };
 
-// This evaluator is for unaligned data. Any time any of the blocks are
-// unaligned, we use this evalutator. Basically, in the evaluator list, this
-// evaluator is right after the aligned evaluator and rt_valid determines
-// which one to use.
-template <typename LB,
-	  typename RB>
-struct Serial_expr_evaluator<1, LB, RB, Simd_unaligned_loop_fusion_tag>
-{
-  typedef typename Adjust_layout_dim<
-                     1, typename Block_layout<LB>::layout_type>::type
-		layout_type;
-
-  static char const* name() { return "Expr_SIMD_Unaligned_Loop"; }
-  
-  static bool const ct_valid =
-    // Is SIMD supported at all ?
-    simd::Simd_traits<typename LB::value_type>::is_accel &&
-    // Check that direct access is possible.
-    Ext_data_cost<LB>::value == 0 &&
-    simd::Proxy_factory<RB, false>::ct_valid &&
-    // Only allow float, double, complex<float>,
-    // and complex<double> at this time.
-    (Type_equal<typename Scalar_of<typename LB::value_type>::type, float>::value ||
-     Type_equal<typename Scalar_of<typename LB::value_type>::type, double>::value) &&
-    // Make sure both sides have the same type.
-    Type_equal<typename LB::value_type, typename RB::value_type>::value &&
-    // Make sure the left side is not a complex split block.
-    !Is_split_block<LB>::value;
-
-
-  static bool rt_valid(LB& lhs, RB const& rhs)
-  {
-    Ext_data<LB, layout_type> dda(lhs, SYNC_OUT);
-    return (dda.stride(0) == 1 &&
-	    simd::Simd_traits<typename LB::value_type>::
-	      alignment_of(dda.data()) == 0 &&
-	    simd::Proxy_factory<RB, false>::rt_valid(rhs, 0));
-  }
-
-  static void exec(LB& lhs, RB const& rhs)
-  {
-    typedef typename simd::LValue_access_traits<typename LB::value_type> WAT;
-    typedef typename simd::Proxy_factory<RB, false>::access_traits EAT;
-
-    length_type const vec_size =
-      simd::Simd_traits<typename LB::value_type>::vec_size;
-    Ext_data<LB, layout_type> dda(lhs, SYNC_OUT);
-
-    simd::Proxy<WAT,true>  lp(dda.data());
-    simd::Proxy<EAT,false> rp(simd::Proxy_factory<RB,false>::create(rhs));
-
-    length_type const size = dda.size(0);
-    length_type n = size;
-
-    // loop using proxy interface. This generates the best code
-    // with gcc 3.4 (with gcc 4.1 the difference to the first case
-    // above is negligible).
-
-    while (n >= vec_size)
-    {
-      lp.store(rp.load());
-      n -= vec_size;
-      lp.increment();
-      rp.increment();
-    }
-
-    // Process the remainder, using simple loop fusion.
-    for (index_type i = size - n; i != size; ++i) lhs.put(i, rhs.get(i));
-  }
-};
-
-
 } // namespace vsip::impl
 } // namespace vsip
 
Index: src/vsip/opt/simd/proxy_factory.hpp
===================================================================
--- src/vsip/opt/simd/proxy_factory.hpp	(revision 0)
+++ src/vsip/opt/simd/proxy_factory.hpp	(revision 0)
@@ -0,0 +1,249 @@
+/* Copyright (c) 2006, 2007, 2008 by CodeSourcery.  All rights reserved.
+
+   This file is available for license from CodeSourcery, Inc. under the terms
+   of a commercial license and under the GPL.  It is not part of the VSIPL++
+   reference implementation and is not available under the BSD license.
+*/
+/** @file    vsip/opt/simd/proxy_factory.hpp
+    @author  Stefan Seefeld
+    @date    2006-07-25
+    @brief   VSIPL++ Library: SIMD expression evaluator proxy factory.
+
+*/
+
+#ifndef VSIP_IMPL_SIMD_PROXY_FACTORY_HPP
+#define VSIP_IMPL_SIMD_PROXY_FACTORY_HPP
+
+#if VSIP_IMPL_REF_IMPL
+# error "vsip/opt files cannot be used as part of the reference impl."
+#endif
+
+/***********************************************************************
+  Included Files
+***********************************************************************/
+
+#include <vsip/support.hpp>
+#include <vsip/opt/simd/simd.hpp>
+#include <vsip/opt/simd/expr_iterator.hpp>
+#include <vsip/core/expr/operations.hpp>
+#include <vsip/core/expr/unary_block.hpp>
+#include <vsip/core/expr/binary_block.hpp>
+#include <vsip/core/metaprogramming.hpp>
+#include <vsip/core/extdata.hpp>
+#include <vsip/opt/expr/serial_evaluator.hpp>
+
+/***********************************************************************
+  Definitions
+***********************************************************************/
+
+namespace vsip
+{
+namespace impl
+{
+namespace simd
+{
+
+template <typename BlockT, bool A>
+struct Proxy_factory
+{
+  typedef Direct_access_traits<typename BlockT::value_type> access_traits;
+  typedef Proxy<access_traits, A> proxy_type;
+  typedef typename Adjust_layout_dim<
+                     1, typename Block_layout<BlockT>::layout_type>::type
+		layout_type;
+
+  static bool const ct_valid = Ext_data_cost<BlockT>::value == 0 &&
+    !Is_split_block<BlockT>::value;
+
+  static bool 
+  rt_valid(BlockT const &b, int alignment)
+  {
+    Ext_data<BlockT, layout_type> dda(b, SYNC_IN);
+    return dda.stride(0) == 1 && 
+      (!A ||
+       Simd_traits<typename BlockT::value_type>::alignment_of(dda.data()) ==
+       alignment);
+  }
+
+  static int
+  alignment(BlockT const &b)
+  {
+    Ext_data<BlockT, layout_type> dda(b, SYNC_IN);
+    return Simd_traits<typename BlockT::value_type>::alignment_of(dda.data());
+  }
+
+  static proxy_type
+  create(BlockT const &b) 
+  {
+    Ext_data<BlockT, layout_type> dda(b, SYNC_IN);
+    return proxy_type(dda.data());
+  }
+};
+
+template <typename T, bool A>
+struct Proxy_factory<Scalar_block<1, T>, A>
+{
+  typedef Scalar_access_traits<T> access_traits;
+  typedef Proxy<access_traits, A> proxy_type;
+  static bool const ct_valid = true;
+
+  static bool 
+  rt_valid(Scalar_block<1, T> const &, int) {return true;}
+
+  static proxy_type
+  create(Scalar_block<1, T> const &b) 
+  {
+    return proxy_type(b.value());
+  }
+};
+
+template <dimension_type D,
+	  template <typename> class O,
+	  typename B,
+	  typename T,
+	  bool A>
+struct Proxy_factory<Unary_expr_block<D, O, B, T> const, A>
+{
+  typedef 
+    Unary_access_traits<typename Proxy_factory<B,A>::proxy_type, O>
+    access_traits;
+  typedef Proxy<access_traits,A> proxy_type;
+
+  static bool const ct_valid =
+    Unary_operator_map<T, O>::is_supported &&
+    Type_equal<typename B::value_type, T>::value &&
+    Proxy_factory<B, A>::ct_valid;
+
+  static bool 
+  rt_valid(Unary_expr_block<D, O, B, T> const &b, int alignment)
+  {
+    return Proxy_factory<B, A>::rt_valid(b.op(), alignment);
+  }
+
+  static proxy_type
+  create(Unary_expr_block<D, O, B, T> const &b)
+  {
+    return proxy_type(Proxy_factory<B, A>::create(b.op()));
+  }
+};
+
+// This proxy is specialized for unaligned blocks. If the user specifies
+// unaligned(block), this is a hint to switch to an unaligned proxy.
+template <dimension_type D,
+	  typename B,
+	  typename T,
+	  bool A>
+struct Proxy_factory<Unary_expr_block<D, unaligned_functor, B, T> const, A>
+{
+  typedef typename Proxy_factory<B, false>::access_traits access_traits;
+  typedef Proxy<access_traits,false> proxy_type;
+  static bool const ct_valid = Proxy_factory<B,false>::ct_valid;
+
+
+  static bool 
+  rt_valid(Unary_expr_block<D, unaligned_functor, B, T> const &b, int alignment)
+  {
+    return Proxy_factory<B, false>::rt_valid(b.op(), alignment);
+  }
+
+  static proxy_type
+  create(Unary_expr_block<D, unaligned_functor, B, T> const &b)
+  {
+    return proxy_type(Proxy_factory<B, false>::create(b.op()));
+  }
+};
+
+template <dimension_type                D,
+	  template <typename, typename> class O,
+	  typename                      LB,
+	  typename                      LT,
+	  typename                      RB,
+	  typename                      RT,
+	  bool A>
+struct Proxy_factory<Binary_expr_block<D, O, LB, LT, RB, RT> const, A>
+{
+  typedef
+    Binary_access_traits<typename Proxy_factory<LB, A>::proxy_type,
+			 typename Proxy_factory<RB, A>::proxy_type, O> 
+    access_traits;
+  typedef Proxy<access_traits, A> proxy_type;
+  static bool const ct_valid = 
+    Type_equal<typename LB::value_type, LT>::value &&
+    Type_equal<typename RB::value_type, RT>::value &&
+    Type_equal<LT, RT>::value &&
+    Binary_operator_map<LT, O>::is_supported &&
+    Proxy_factory<LB, A>::ct_valid &&
+    Proxy_factory<RB, A>::ct_valid;
+
+  static bool 
+  rt_valid(Binary_expr_block<D, O, LB, LT, RB, RT> const &b, int alignment)
+  {
+    return Proxy_factory<LB, A>::rt_valid(b.left(), alignment) &&
+           Proxy_factory<RB, A>::rt_valid(b.right(), alignment);
+  }
+
+  static proxy_type
+  create(Binary_expr_block<D, O, LB, LT, RB, RT> const &b)
+  {
+    typename Proxy_factory<LB, A>::proxy_type lp =
+      Proxy_factory<LB, A>::create(b.left());
+    typename Proxy_factory<RB, A>::proxy_type rp =
+      Proxy_factory<RB, A>::create(b.right());
+
+    return proxy_type(lp, rp);
+  }
+};
+
+template <dimension_type                         D,
+	  template <typename, typename,typename> class O,
+	  typename                               Block1, typename Type1,
+	  typename                               Block2, typename Type2,
+	  typename                               Block3, typename Type3,
+	  bool A>
+struct Proxy_factory<Ternary_expr_block<D, O,
+  Block1,Type1,Block2,Type2,Block3,Type3> const, A>
+{
+  typedef Ternary_access_traits<typename Proxy_factory<Block1, A>::proxy_type,
+                                typename Proxy_factory<Block2, A>::proxy_type,
+                                typename Proxy_factory<Block3, A>::proxy_type,
+		 	        O> 
+    access_traits;
+
+  typedef Ternary_expr_block<D, O, Block1,Type1,Block2,Type2,Block3,Type3>
+    SrcBlock;
+
+  typedef Proxy<access_traits, A> proxy_type;
+  static bool const ct_valid = 
+    Ternary_operator_map<Type1, O>::is_supported &&
+    Proxy_factory<Block1, A>::ct_valid &&
+    Proxy_factory<Block2, A>::ct_valid &&
+    Proxy_factory<Block3, A>::ct_valid;
+
+  static bool 
+  rt_valid(SrcBlock const &b, int alignment)
+  {
+    return Proxy_factory<Block1, A>::rt_valid(b.first(), alignment) &&
+           Proxy_factory<Block2, A>::rt_valid(b.second(), alignment) &&
+           Proxy_factory<Block3, A>::rt_valid(b.third(), alignment);
+  }
+
+  static proxy_type
+  create(SrcBlock const &b)
+  {
+    typename Proxy_factory<Block1, A>::proxy_type
+      b1p = Proxy_factory<Block1, A>::create(b.first());
+    typename Proxy_factory<Block2, A>::proxy_type
+      b2p = Proxy_factory<Block2, A>::create(b.second());
+    typename Proxy_factory<Block3, A>::proxy_type
+      b3p = Proxy_factory<Block3, A>::create(b.third());
+
+    return proxy_type(b1p,b2p,b3p);
+  }
+};
+
+
+} // namespace vsip::impl::simd
+} // namespace vsip::impl
+} // namespace vsip
+
+#endif // VSIP_IMPL_SIMD_PROXY_FACTORY_HPP
Index: src/vsip/opt/simd/eval_unaligned.hpp
===================================================================
--- src/vsip/opt/simd/eval_unaligned.hpp	(revision 0)
+++ src/vsip/opt/simd/eval_unaligned.hpp	(revision 0)
@@ -0,0 +1,121 @@
+/* Copyright (c) 2006, 2007, 2008 by CodeSourcery.  All rights reserved.
+
+   This file is available for license from CodeSourcery, Inc. under the terms
+   of a commercial license and under the GPL.  It is not part of the VSIPL++
+   reference implementation and is not available under the BSD license.
+*/
+/** @file    vsip/opt/simd/eval_unaligned.hpp
+    @author  Stefan Seefeld
+    @date    2006-07-25
+    @brief   VSIPL++ Library: SIMD unaligned loop-fusion expression evaluator.
+
+*/
+
+#ifndef VSIP_IMPL_SIMD_EVAL_UNALIGNED_HPP
+#define VSIP_IMPL_SIMD_EVAL_UNALIGNED_HPP
+
+#if VSIP_IMPL_REF_IMPL
+# error "vsip/opt files cannot be used as part of the reference impl."
+#endif
+
+/***********************************************************************
+  Included Files
+***********************************************************************/
+
+#include <vsip/support.hpp>
+#include <vsip/opt/simd/simd.hpp>
+#include <vsip/opt/simd/expr_iterator.hpp>
+#include <vsip/core/expr/operations.hpp>
+#include <vsip/core/expr/unary_block.hpp>
+#include <vsip/core/expr/binary_block.hpp>
+#include <vsip/core/metaprogramming.hpp>
+#include <vsip/core/extdata.hpp>
+#include <vsip/opt/expr/serial_evaluator.hpp>
+#include <vsip/opt/simd/proxy_factory.hpp>
+
+/***********************************************************************
+  Definitions
+***********************************************************************/
+
+namespace vsip
+{
+namespace impl
+{
+
+// SIMD Loop Fusion evaluator for unaligned expressions.
+//
+// Handles expressions where the result is aligned, but the operands
+// are unaligned.
+
+template <typename LB,
+	  typename RB>
+struct Serial_expr_evaluator<1, LB, RB, Simd_unaligned_loop_fusion_tag>
+{
+  typedef typename Adjust_layout_dim<
+                     1, typename Block_layout<LB>::layout_type>::type
+		layout_type;
+
+  static char const* name() { return "Expr_SIMD_Unaligned_Loop"; }
+  
+  static bool const ct_valid =
+    // Is SIMD supported at all ?
+    simd::Simd_traits<typename LB::value_type>::is_accel &&
+    // Check that direct access is possible.
+    Ext_data_cost<LB>::value == 0 &&
+    simd::Proxy_factory<RB, false>::ct_valid &&
+    // Only allow float, double, complex<float>,
+    // and complex<double> at this time.
+    (Type_equal<typename Scalar_of<typename LB::value_type>::type, float>::value ||
+     Type_equal<typename Scalar_of<typename LB::value_type>::type, double>::value) &&
+    // Make sure both sides have the same type.
+    Type_equal<typename LB::value_type, typename RB::value_type>::value &&
+    // Make sure the left side is not a complex split block.
+    !Is_split_block<LB>::value;
+
+
+  static bool rt_valid(LB& lhs, RB const& rhs)
+  {
+    Ext_data<LB, layout_type> dda(lhs, SYNC_OUT);
+    return (dda.stride(0) == 1 &&
+	    simd::Simd_traits<typename LB::value_type>::
+	      alignment_of(dda.data()) == 0 &&
+	    simd::Proxy_factory<RB, false>::rt_valid(rhs, 0));
+  }
+
+  static void exec(LB& lhs, RB const& rhs)
+  {
+    typedef typename simd::LValue_access_traits<typename LB::value_type> WAT;
+    typedef typename simd::Proxy_factory<RB, false>::access_traits EAT;
+
+    length_type const vec_size =
+      simd::Simd_traits<typename LB::value_type>::vec_size;
+    Ext_data<LB, layout_type> dda(lhs, SYNC_OUT);
+
+    simd::Proxy<WAT,true>  lp(dda.data());
+    simd::Proxy<EAT,false> rp(simd::Proxy_factory<RB,false>::create(rhs));
+
+    length_type const size = dda.size(0);
+    length_type n = size;
+
+    // loop using proxy interface. This generates the best code
+    // with gcc 3.4 (with gcc 4.1 the difference to the first case
+    // above is negligible).
+
+    while (n >= vec_size)
+    {
+      lp.store(rp.load());
+      n -= vec_size;
+      lp.increment();
+      rp.increment();
+    }
+
+    // Process the remainder, using simple loop fusion.
+    for (index_type i = size - n; i != size; ++i) lhs.put(i, rhs.get(i));
+  }
+};
+
+
+} // namespace vsip::impl
+} // namespace vsip
+
+#endif // VSIP_IMPL_SIMD_EVAL_UNALIGNED_HPP
Index: src/vsip/opt/simd/expr_iterator.hpp
===================================================================
--- src/vsip/opt/simd/expr_iterator.hpp	(revision 191870)
+++ src/vsip/opt/simd/expr_iterator.hpp	(working copy)
@@ -327,34 +327,34 @@
   typedef typename simd::perm_simd_type  perm_simd_type;
   typedef typename simd::value_type      value_type;
 
-  Simd_unaligned_loader(value_type const* ptr) : ptr_unaligned_(ptr) 
+  Simd_unaligned_loader(value_type const* ptr)
   {
     ptr_aligned_    = (value_type*)((intptr_t)ptr & ~(simd::alignment-1));
 
     x0_  = simd::load((value_type*)ptr_aligned_);
-    x1_  = simd::load((value_type*)(ptr_aligned_+simd::vec_size));
-    sh_  = simd::shift_for_addr(ptr_unaligned_);
+    sh_  = simd::shift_for_addr(ptr);
   }
 
   simd_type load() const
-  { return simd::perm(x0_, x1_, sh_); }
+  {
+    x1_  = simd::load((value_type*)(ptr_aligned_+simd::vec_size));
+    return simd::perm(x0_, x1_, sh_);
+  }
 
   void increment(length_type n = 1)
   {
-    ptr_unaligned_ += n * simd::vec_size;
     ptr_aligned_   += n * simd::vec_size;
   
-    // update x0
+    // Update x0.
+    //
+    // Note: this requires load() to be called at least once before each
+    //       call to increment().
     x0_ = (n == 1) ? x1_ : simd::load((value_type*)ptr_aligned_);
-
-    // update x1
-    x1_ = simd::load((value_type*)(ptr_aligned_+simd::vec_size));
   }
 
-  value_type const*            ptr_unaligned_;
   value_type const*            ptr_aligned_;
   simd_type                    x0_;
-  simd_type                    x1_;
+  mutable simd_type            x1_;
   perm_simd_type               sh_;
 
 };
@@ -568,7 +568,7 @@
   AB const &left() const { return left_;}
   C const &right() const { return right_;}
 
-  simd_type load() const 
+  simd_type load() const
   {
     simd_type a = left_.left().load();
     simd_type b = left_.right().load();
Index: src/vsip/opt/expr/serial_dispatch.hpp
===================================================================
--- src/vsip/opt/expr/serial_dispatch.hpp	(revision 191870)
+++ src/vsip/opt/expr/serial_dispatch.hpp	(working copy)
@@ -47,6 +47,9 @@
 #ifdef VSIP_IMPL_HAVE_SIMD_LOOP_FUSION
 #  include <vsip/opt/simd/expr_evaluator.hpp>
 #endif
+#ifdef VSIP_IMPL_HAVE_SIMD_UNALIGNED_LOOP_FUSION
+#  include <vsip/opt/simd/eval_unaligned.hpp>
+#endif
 #ifdef VSIP_IMPL_HAVE_SIMD_GENERIC
 #  include <vsip/opt/simd/eval_generic.hpp>
 #endif
Index: doc/quickstart/quickstart.xml
===================================================================
--- doc/quickstart/quickstart.xml	(revision 192274)
+++ doc/quickstart/quickstart.xml	(working copy)
@@ -1215,6 +1215,40 @@
      </varlistentry>
 
      <varlistentry>
+      <term><option>--enable-simd-loop-fusion</option></term>
+      <listitem>
+       <para>
+        Enable VSIPL++ to generate SIMD instructions for loop-fusion
+        expressions (containing data that is SIMD aligned).
+
+        This option is useful for increasing performance of many
+        VSIPL++ expressions on platforms with SIMD instruction
+        set extensions (such as Intel SSE, or Power VMX/AltiVec).
+
+        The default is not to generate SIMD instructions.
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
+      <term><option>--enable-simd-unaligned-loop-fusion</option></term>
+      <listitem>
+       <para>
+        Enable VSIPL++ to generate SIMD instructions for loop-fusion
+        expressions, possibly containing data that is SIMD unaligned.
+
+        This option is useful for increasing performance of VSIPL++
+        expressions that work with unaligned data on platforms with
+        SIMD instruction set extensions (such as Intel SSE, or Power
+        VMX/AltiVec).
+
+        The default is to follow the setting of
+        <option>--enable-simd-loop-fusion</option>.
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
       <term><option>--with-complex=<replaceable>format</replaceable></option></term>
       <listitem>
        <para>
Index: src/vsip/opt/simd/rscvmul.hpp
===================================================================
--- src/vsip/opt/simd/rscvmul.hpp	(revision 191870)
+++ src/vsip/opt/simd/rscvmul.hpp	(working copy)
@@ -113,7 +113,7 @@
     }
 
     // clean up initial unaligned values
-    while (simd::alignment_of((T*)B) != 0)
+    while (n && simd::alignment_of((T*)B) != 0)
     {
       *R = alpha * *B;
       R++; B++;
@@ -196,8 +196,10 @@
     T                        alpha,
     std::pair<T*, T*> const& B,
     std::pair<T*, T*> const& R,
-    int n)
+    int                      n)
   {
+    assert(n >= 0);
+
     typedef Simd_traits<T> simd;
     typedef typename simd::simd_type simd_type;
     
@@ -225,7 +227,7 @@
     }
 
     // clean up initial unaligned values
-    while (simd::alignment_of(pRr) != 0)
+    while (n && simd::alignment_of(pRr) != 0)
     {
       *pRr = alpha * *pBr;
       *pRi = alpha * *pBi;
Index: src/vsip/opt/simd/threshold.hpp
===================================================================
--- src/vsip/opt/simd/threshold.hpp	(revision 191870)
+++ src/vsip/opt/simd/threshold.hpp	(working copy)
@@ -178,7 +178,7 @@
     }
 
     // clean up initial unaligned values
-    while (simd::alignment_of(A) != 0)
+    while (n && simd::alignment_of(A) != 0)
     {
       if(O<T,T>::apply(*A,*B)) *Z = *A;
       else *Z = k;
Index: src/vsip/opt/simd/vgt.hpp
===================================================================
--- src/vsip/opt/simd/vgt.hpp	(revision 191870)
+++ src/vsip/opt/simd/vgt.hpp	(working copy)
@@ -114,7 +114,7 @@
   }
 
   // clean up initial unaligned values
-  while (simd::alignment_of(A) != 0)
+  while (n && simd::alignment_of(A) != 0)
   {
     *R = *A > *B;
     R++; A++; B++;
Index: src/vsip/opt/simd/vma_ip_csc.hpp
===================================================================
--- src/vsip/opt/simd/vma_ip_csc.hpp	(revision 191870)
+++ src/vsip/opt/simd/vma_ip_csc.hpp	(working copy)
@@ -113,7 +113,7 @@
     }
 
     // clean up initial unaligned values
-    while (simd::alignment_of((T*)R) != 0)
+    while (n && simd::alignment_of((T*)R) != 0)
     {
       *R += a * *B;
       R++; B++;
Index: src/vsip/opt/simd/vaxpy.hpp
===================================================================
--- src/vsip/opt/simd/vaxpy.hpp	(revision 191870)
+++ src/vsip/opt/simd/vaxpy.hpp	(working copy)
@@ -116,7 +116,7 @@
     }
 
     // clean up initial unaligned values
-    while (simd::alignment_of((T*)R) != 0)
+    while (n && simd::alignment_of((T*)R) != 0)
     {
       *R = a * *B + *C;
       R++; B++; C++;
Index: src/vsip/opt/simd/vadd.hpp
===================================================================
--- src/vsip/opt/simd/vadd.hpp	(revision 191870)
+++ src/vsip/opt/simd/vadd.hpp	(working copy)
@@ -113,7 +113,7 @@
     }
 
     // clean up initial unaligned values
-    while (simd::alignment_of(A) != 0)
+    while (n && simd::alignment_of(A) != 0)
     {
       *R = *A + *B;
       R++; A++; B++;
Index: src/vsip/opt/simd/vmul.hpp
===================================================================
--- src/vsip/opt/simd/vmul.hpp	(revision 191870)
+++ src/vsip/opt/simd/vmul.hpp	(working copy)
@@ -113,7 +113,7 @@
     }
 
     // clean up initial unaligned values
-    while (simd::alignment_of(A) != 0)
+    while (n && simd::alignment_of(A) != 0)
     {
       *R = *A * *B;
       R++; A++; B++;
@@ -191,7 +191,7 @@
     }
 
     // clean up initial unaligned values
-    while (simd::alignment_of((T*)A) != 0)
+    while (n && simd::alignment_of((T*)A) != 0)
     {
       *R = *A * *B;
       R++; A++; B++;
@@ -329,7 +329,7 @@
     }
 
     // clean up initial unaligned values
-    while (simd::alignment_of(pRr) != 0)
+    while (n && simd::alignment_of(pRr) != 0)
     {
       T rr = *pAr * *pBr - *pAi * *pBi;
       *pRi = *pAr * *pBi + *pAi * *pBr;
Index: src/vsip/opt/simd/vlogic.hpp
===================================================================
--- src/vsip/opt/simd/vlogic.hpp	(revision 191870)
+++ src/vsip/opt/simd/vlogic.hpp	(working copy)
@@ -278,7 +278,7 @@
     }
 
     // clean up initial unaligned values
-    while (traits::alignment_of((SimdValueT*)A) != 0)
+    while (n && traits::alignment_of((SimdValueT*)A) != 0)
     {
       *R = FunctionT::exec(*A);
       R++; A++;
@@ -386,7 +386,7 @@
     }
 
     // clean up initial unaligned values
-    while (traits::alignment_of((SimdValueT*)A) != 0)
+    while (n && traits::alignment_of((SimdValueT*)A) != 0)
     {
       *R = FunctionT::exec(*A, *B);
       R++; A++; B++;
Index: tests/regressions/view_offset.cpp
===================================================================
--- tests/regressions/view_offset.cpp	(revision 0)
+++ tests/regressions/view_offset.cpp	(revision 0)
@@ -0,0 +1,236 @@
+/* Copyright (c) 2008 by CodeSourcery, LLC.  All rights reserved. */
+
+
+/** @file    tests/regressions/view_offset.cpp
+    @author  Jules Bergmann
+    @date    2008-02-22
+    @brief   VSIPL++ Library: Regression test for small (less than SIMD
+             width), unaligned element-wise vector operations that triggered
+	     a bug in the built-in generic SIMD routines.
+     
+*/
+
+/***********************************************************************
+  Included Files
+***********************************************************************/
+
+#define VERBOSE 0
+
+#if VERBOSE
+#  include <iostream>
+#endif
+
+#include <vsip/initfin.hpp>
+#include <vsip/support.hpp>
+#include <vsip/vector.hpp>
+#include <vsip/matrix.hpp>
+#include <vsip/domain.hpp>
+#include <vsip/random.hpp>
+
+#include <vsip_csl/test.hpp>
+
+using namespace vsip;
+using namespace vsip_csl;
+
+
+/***********************************************************************
+  Definitions - Utility Functions
+***********************************************************************/
+
+template <typename T>
+void
+test_vadd(
+  length_type len,
+  length_type offset1,
+  length_type offset2,
+  length_type offset3)
+{
+  Rand<T> gen(0, 0);
+
+  Vector<T> big_A(len + offset1);
+  Vector<T> big_B(len + offset2);
+  Vector<T> big_Z(len + offset3);
+
+  typename Vector<T>::subview_type A = big_A(Domain<1>(offset1, 1, len));
+  typename Vector<T>::subview_type B = big_B(Domain<1>(offset2, 1, len));
+  typename Vector<T>::subview_type Z = big_Z(Domain<1>(offset3, 1, len));
+
+  A = gen.randu(len);
+  B = gen.randu(len);
+
+  Z = A + B;
+
+  for (index_type i=0; i<len; ++i)
+  {
+    test_assert(Almost_equal<T>::eq(Z.get(i), A.get(i) + B.get(i)));
+  }
+}
+
+
+
+template <typename T>
+void
+test_vma_cSC(
+  length_type len,
+  length_type offset1,
+  length_type offset2,
+  length_type offset3)
+{
+  typedef typename vsip::impl::Scalar_of<T>::type ST;
+
+  Rand<ST> rgen(0, 0);
+  Rand<T>  cgen(0, 0);
+
+  Vector<ST> big_B(len + offset1);
+  Vector<T>  big_C(len + offset2);
+  Vector<T>  big_Z(len + offset3);
+
+  T a = 2.0;
+  typename Vector<ST>::subview_type B = big_B(Domain<1>(offset1, 1, len));
+  typename Vector<T>::subview_type  C = big_C(Domain<1>(offset2, 1, len));
+  typename Vector<T>::subview_type  Z = big_Z(Domain<1>(offset3, 1, len));
+
+  B = rgen.randu(len);
+  C = cgen.randu(len);
+
+  Z = a * B + C;
+
+  for (index_type i=0; i<len; ++i)
+  {
+    test_assert(Almost_equal<T>::eq(Z.get(i), a * B.get(i) + C.get(i)));
+  }
+}
+
+
+
+template <typename T>
+void
+test_vmul(
+  length_type len,
+  length_type offset1,
+  length_type offset2,
+  length_type offset3)
+{
+  Rand<T> gen(0, 0);
+
+  Vector<T> big_A(len + offset1);
+  Vector<T> big_B(len + offset2);
+  Vector<T> big_Z(len + offset3);
+
+  typename Vector<T>::subview_type A = big_A(Domain<1>(offset1, 1, len));
+  typename Vector<T>::subview_type B = big_B(Domain<1>(offset2, 1, len));
+  typename Vector<T>::subview_type Z = big_Z(Domain<1>(offset3, 1, len));
+
+  A = gen.randu(len);
+  B = gen.randu(len);
+
+  Z = A * B;
+
+  for (index_type i=0; i<len; ++i)
+  {
+#if VERBOSE
+    if (!equal(Z.get(i), A.get(i) * B.get(i)))
+    {
+      std::cout << "Z(" << i << ")        = " << Z(i) << std::endl;
+      std::cout << "A(" << i << ") * B(" << i << ") = "
+		<< A(i) * B(i) << std::endl;
+    }
+#endif
+    test_assert(Almost_equal<T>::eq(Z.get(i), A.get(i) * B.get(i)));
+  }
+}
+
+
+
+template <typename T>
+void
+test_vthresh(
+  length_type len,
+  length_type offset1,
+  length_type offset2,
+  length_type offset3)
+{
+  Rand<T> gen(0, 0);
+
+  Vector<T> big_A(len + offset1);
+  Vector<T> big_B(len + offset2);
+  Vector<T> big_Z(len + offset3);
+
+  typename Vector<T>::subview_type A = big_A(Domain<1>(offset1, 1, len));
+  typename Vector<T>::subview_type B = big_B(Domain<1>(offset2, 1, len));
+  typename Vector<T>::subview_type Z = big_Z(Domain<1>(offset3, 1, len));
+  T                                k = 0.5;
+
+  A = gen.randu(len);
+  B = gen.randu(len);
+
+  Z = ite(A > B, A, k);
+
+  for (index_type i=0; i<len; ++i)
+  {
+    test_assert(Almost_equal<T>::eq(Z.get(i), A.get(i) > B.get(i) ? A.get(i) : k));
+  }
+}
+
+
+
+
+template <typename T>
+void
+test_sweep()
+{
+  for (index_type i=1; i<=128; ++i)
+  {
+    // 080222: These broke built-in SIMD functions when i < vector size.
+    test_vmul<T>(i, 1, 1, 1);
+    test_vadd<T>(i, 1, 1, 1);
+
+    // 080222: This would have been broken had it been dispatched to.
+    test_vma_cSC<T>(i, 1, 1, 1);
+
+    // These work fine.
+    test_vmul<T>(i, 0, 0, 0);
+    test_vmul<T>(i, 1, 0, 0);
+    test_vmul<T>(i, 0, 1, 0);
+    test_vmul<T>(i, 0, 0, 1);
+
+    test_vadd<T>(i, 0, 0, 0);
+    test_vadd<T>(i, 1, 0, 0);
+    test_vadd<T>(i, 0, 1, 0);
+    test_vadd<T>(i, 0, 0, 1);
+
+    test_vma_cSC<T>(i, 0, 0, 0);
+    test_vma_cSC<T>(i, 1, 0, 0);
+    test_vma_cSC<T>(i, 0, 1, 0);
+    test_vma_cSC<T>(i, 0, 0, 1);
+  }
+}
+
+template <typename T>
+void
+test_sweep_real()
+{
+  for (index_type i=1; i<=128; ++i)
+  {
+    test_vthresh<T>(i, 1, 1, 1);
+
+    test_vthresh<T>(i, 0, 0, 0);
+    test_vthresh<T>(i, 1, 0, 0);
+    test_vthresh<T>(i, 0, 1, 0);
+    test_vthresh<T>(i, 0, 0, 1);
+  }
+}
+
+
+
+
+int
+main(int argc, char** argv)
+{
+  vsipl init(argc, argv);
+
+  test_sweep<float          >();
+  test_sweep<complex<float> >();
+
+  test_sweep_real<float>();
+}
Index: src/vsip_csl/img/impl/pwarp_simd.hpp
===================================================================
--- src/vsip_csl/img/impl/pwarp_simd.hpp	(revision 191870)
+++ src/vsip_csl/img/impl/pwarp_simd.hpp	(working copy)
@@ -408,6 +408,16 @@
 	bool_simd_t vec_1_good  = ui_simd::band(vec_u1_good, vec_v1_good);
 	bool_simd_t vec_2_good  = ui_simd::band(vec_u2_good, vec_v2_good);
 	bool_simd_t vec_3_good  = ui_simd::band(vec_u3_good, vec_v3_good);
+
+	// Clear u/v if out of bounds.
+	vec_u0 = simd::band(vec_0_good, vec_u0);
+	vec_u1 = simd::band(vec_1_good, vec_u1);
+	vec_u2 = simd::band(vec_2_good, vec_u2);
+	vec_u3 = simd::band(vec_3_good, vec_u3);
+	vec_v0 = simd::band(vec_0_good, vec_v0);
+	vec_v1 = simd::band(vec_1_good, vec_v1);
+	vec_v2 = simd::band(vec_2_good, vec_v2);
+	vec_v3 = simd::band(vec_3_good, vec_v3);
 	
 #if __PPU__
 	us_simd_t vec_s01_good = (us_simd_t)vec_pack(vec_0_good, vec_1_good);
@@ -518,22 +528,22 @@
 	ui_simd::extract_all(vec_2_offset, off_20, off_21, off_22, off_23);
 	ui_simd::extract_all(vec_3_offset, off_30, off_31, off_32, off_33);
 	
-	T* p_00 = p_in + off_00;
-	T* p_01 = p_in + off_01;
-	T* p_02 = p_in + off_02;
-	T* p_03 = p_in + off_03;
-	T* p_10 = p_in + off_10;
-	T* p_11 = p_in + off_11;
-	T* p_12 = p_in + off_12;
-	T* p_13 = p_in + off_13;
-	T* p_20 = p_in + off_20;
-	T* p_21 = p_in + off_21;
-	T* p_22 = p_in + off_22;
-	T* p_23 = p_in + off_23;
-	T* p_30 = p_in + off_30;
-	T* p_31 = p_in + off_31;
-	T* p_32 = p_in + off_32;
-	T* p_33 = p_in + off_33;
+	T* p_00 = p_in + off_00; assert(off_00 <= rows*cols);
+	T* p_01 = p_in + off_01; assert(off_01 <= rows*cols);
+	T* p_02 = p_in + off_02; assert(off_02 <= rows*cols);
+	T* p_03 = p_in + off_03; assert(off_03 <= rows*cols);
+	T* p_10 = p_in + off_10; assert(off_10 <= rows*cols);
+	T* p_11 = p_in + off_11; assert(off_11 <= rows*cols);
+	T* p_12 = p_in + off_12; assert(off_12 <= rows*cols);
+	T* p_13 = p_in + off_13; assert(off_13 <= rows*cols);
+	T* p_20 = p_in + off_20; assert(off_20 <= rows*cols);
+	T* p_21 = p_in + off_21; assert(off_21 <= rows*cols);
+	T* p_22 = p_in + off_22; assert(off_22 <= rows*cols);
+	T* p_23 = p_in + off_23; assert(off_23 <= rows*cols);
+	T* p_30 = p_in + off_30; assert(off_30 <= rows*cols);
+	T* p_31 = p_in + off_31; assert(off_31 <= rows*cols);
+	T* p_32 = p_in + off_32; assert(off_32 <= rows*cols);
+	T* p_33 = p_in + off_33; assert(off_33 <= rows*cols);
 
 	T z00_00 =  *p_00;
 	T z10_00 = *(p_00 + in_stride_0);
Index: tests/vsip_csl/pwarp.cpp
===================================================================
--- tests/vsip_csl/pwarp.cpp	(revision 191870)
+++ tests/vsip_csl/pwarp.cpp	(working copy)
@@ -12,14 +12,16 @@
 
 #define VERBOSE 1
 #define SAVE_IMAGES 0
-#define DO_CHECK 1
+#define DO_CHECK 0
+#define TEST_TYPES 1
 
-#define NUM_TCS 4
+#define NUM_TCS 6
 
 #if VERBOSE
 #  include <iostream>
 #endif
 #include <string>
+#include <sstream>
 
 #include <vsip/initfin.hpp>
 #include <vsip/support.hpp>
@@ -177,6 +179,36 @@
     P(1, 0) = 0; P(1, 1) = 1; P(1, 2) = 0;
     P(2, 0) = 0; P(2, 1) = 0; P(2, 2) = 1;
     break;
+
+  case 4: // Random projection #3, extracted from example application.
+          // Broke SPU input streaming for VGA images.
+    P(0, 0) = 1.00202;
+    P(0, 1) = 0.00603114;
+    P(0, 2) = 1.03277;
+
+    P(1, 0) = 0.000532397;
+    P(1, 1) = 1.01655;
+    P(1, 2) = 1.66292;
+
+    P(2, 0) = 1.40122e-06;
+    P(2, 1) = 1.05832e-05;
+    P(2, 2) = 1.00002;
+    break;
+
+  case 5: // Random projection #4, extracted from example application.
+          // Broke SIMD for VGA images.
+    P(0, 0) = 1.00504661;
+    P(0, 1) = 0.0150403921;
+    P(0, 2) = 9.60451126;
+
+    P(1, 0) = 0.00317225;
+    P(1, 1) = 1.04547524;
+    P(1, 2) = 16.1063614;
+
+    P(2, 0) = 2.21413484e-06;
+    P(2, 1) = 2.5766507e-05;
+    P(2, 2) = 1.00024176;
+    break;
   }
 }
 
@@ -449,7 +481,8 @@
   typedef typename Perspective_warp<CoeffT, T, interp_linear, forward>
     ::impl_tag impl_tag;
   std::cout << f_prefix
-	    << " (" << Dispatch_name<impl_tag>::name() << ")"
+	    << " (" << Dispatch_name<impl_tag>::name() << ") "
+	    << rows << " x " << cols << " "
 	    << " tc: " << tc 
 	    << "  error: " << error1 << ", " << error2 << std::endl;
 #else
@@ -489,13 +522,45 @@
   length_type row_size,
   length_type col_size)
 {
-  test_pwarp_obj<CoeffT, T>(f_prefix + "-0", rows,cols, row_size,col_size, 0);
-  test_pwarp_obj<CoeffT, T>(f_prefix + "-1", rows,cols, row_size,col_size, 1);
-  test_pwarp_obj<CoeffT, T>(f_prefix + "-2", rows,cols, row_size,col_size, 2);
-  test_pwarp_obj<CoeffT, T>(f_prefix + "-3", rows,cols, row_size,col_size, 3);
+  for (index_type i=0; i<NUM_TCS; ++i)
+  {
+    std::ostringstream filename;
+    filename << f_prefix << "-" << i;
+    test_pwarp_obj<CoeffT, T>(filename.str(), rows,cols, row_size,col_size, i);
+  }
 }
 
 
+#if TEST_TYPES
+void
+test_types(
+  length_type rows,
+  length_type cols,
+  length_type r_size,
+  length_type c_size)
+{
+  typedef unsigned char byte_t;
+
+#if TEST_LEVEL >= 2
+  // Cool types, but not that useful in practice.
+  test_perspective_fun<double, double>("double", rows, cols, r_size, c_size);
+  test_perspective_fun<double, float> ("dfloat", rows, cols, r_size, c_size);
+  test_perspective_fun<double, byte_t>("duchar", rows, cols, r_size, c_size);
+
+  test_perspective_obj<double, float> ("obj-dfloat",rows,cols,r_size,c_size);
+  test_perspective_obj<double, double>("obj-double",rows,cols,r_size,c_size);
+  test_perspective_obj<double, byte_t>("obj-duchar",rows,cols,r_size,c_size);
+#endif
+
+  test_perspective_fun<float,  float> ("float",  rows, cols, r_size, c_size);
+  test_perspective_fun<float,  byte_t>("uchar",  rows, cols, r_size, c_size);
+
+  test_perspective_obj<float,  float> ("obj-float", rows,cols, r_size, c_size);
+  test_perspective_obj<float,  byte_t>("obj-uchar", rows,cols, r_size, c_size);
+}
+#endif
+
+
 int
 main(int argc, char** argv)
 {
@@ -503,27 +568,14 @@
 
   test_apply_proj<double>();
 
-#if 0
-  test_perspective_fun<double, double>       ("double", 480, 640, 32, 16);
-  test_perspective_fun<double, float>        ("dfloat", 480, 640, 32, 16);
-  test_perspective_fun<float,  float>        ("float",  480, 640, 32, 16);
-  test_perspective_fun<double, unsigned char>("duchar", 480, 640, 32, 16);
-  test_perspective_fun<float,  unsigned char>("uchar",  480, 640, 32, 16);
-
-  test_perspective_obj<double, float>        ("obj-dfloat", 480, 640, 32, 16);
-  test_perspective_obj<double, double>       ("obj-double", 480, 640, 32, 16);
-  test_perspective_obj<float,  float>        ("obj-float",  480, 640, 32, 16);
-  test_perspective_obj<double, unsigned char>("obj-duchar", 480, 640, 32, 16);
-  test_perspective_obj<float,  unsigned char>("obj-uchar",  480, 640, 32, 16);
+#if TEST_TYPES
+  test_types(1080, 1920, 32, 16);
+  test_types(480,   640, 32, 16);
+  test_types(512,   512, 32, 16);
 #endif
 
-  test_perspective_fun<double, double>      ("fun-double", 512, 512, 32, 16);
-  test_perspective_fun<double, float>       ("fun-dfloat", 512, 512, 32, 16);
-  test_perspective_fun<float, float>        ("fun-float",  512, 512, 32, 16);
-  test_perspective_fun<float, unsigned char>("fun-uchar",  512, 512, 32, 16);
-
-  test_perspective_obj<double, double>      ("obj-double", 512, 512, 32, 16);
-  test_perspective_obj<double, float>       ("obj-dfloat", 512, 512, 32, 16);
-  test_perspective_obj<float, float>        ("obj-float",  512, 512, 32, 16);
-  test_perspective_obj<float, unsigned char>("obj-uchar",  512, 512, 32, 16);
+  // Standalone examples for debugging.
+  // test_perspective_obj<float, byte_t>("obj-uchar", 1080, 1920, 32, 16);
+  // test_pwarp_obj<float, byte_t>("obj-uchar", 480, 640, 32, 16, 5);
+  // test_pwarp_obj<float, byte_t>("obj-uchar", 1080, 1920, 32, 16, 5);
 }
Index: tests/regressions/transpose_assign.cpp
===================================================================
--- tests/regressions/transpose_assign.cpp	(revision 191870)
+++ tests/regressions/transpose_assign.cpp	(working copy)
@@ -19,6 +19,7 @@
 ***********************************************************************/
 
 #include <memory>
+#include <iostream>
 
 #include <vsip/initfin.hpp>
 #include <vsip/support.hpp>
@@ -66,8 +67,9 @@
 	  typename DstOrderT,
 	  typename SrcOrderT>
 void
-cover_hl()
+cover_hl(int verbose)
 {
+  if (verbose >= 1) std::cout << "cover_hl\n";
   // These tests fail for Intel C++ 9.1 for Windows prior
   // to workaround in fast-transpose.hpp:
   test_hl<T, DstOrderT, SrcOrderT>(5, 3);  // known bad case
@@ -78,19 +80,25 @@
     length_type max_rows = 32;
     length_type max_cols = 32;
     for (index_type rows=1; rows<max_rows; ++rows)
+    {
+      if (verbose >= 2) std::cout << " - " << rows << " / " << max_rows << "\n";
       for (index_type cols=1; cols<max_cols; ++cols)
 	test_hl<T, DstOrderT, SrcOrderT>(rows, cols);
+    }
   }
 
   {
     length_type max_rows = 256;
     length_type max_cols = 256;
     for (index_type rows=1; rows<max_rows; rows+=3)
+    {
+      if (verbose >= 2) std::cout << " - " << rows << " / " << max_rows << "\n";
       for (index_type cols=1; cols<max_cols; cols+=5)
       {
 	test_hl<T, DstOrderT, SrcOrderT>(rows, cols);
 	test_hl<T, DstOrderT, SrcOrderT>(cols, rows);
       }
+    }
   }
 }
 
@@ -128,25 +136,32 @@
 
 template <typename T>
 void
-cover_ll()
+cover_ll(int verbose)
 {
+  if (verbose >= 1) std::cout << "cover_ll\n";
   {
     length_type max_rows = 32;
     length_type max_cols = 32;
     for (index_type rows=1; rows<max_rows; ++rows)
+    {
+      if (verbose >= 2) std::cout << " - " << rows << " / " << max_rows << "\n";
       for (index_type cols=1; cols<max_cols; ++cols)
 	test_ll<T>(rows, cols);
+    }
   }
 
   {
     length_type max_rows = 256;
     length_type max_cols = 256;
     for (index_type rows=1; rows<max_rows; rows+=3)
+    {
+      if (verbose >= 2) std::cout << " - " << rows << " / " << max_rows << "\n";
       for (index_type cols=1; cols<max_cols; cols+=5)
       {
 	test_ll<T>(rows, cols);
 	test_ll<T>(cols, rows);
       }
+    }
   }
 }
 
@@ -160,11 +175,15 @@
 
   vsipl init(argc, argv);
 
-  cover_hl<float, row2_type, col2_type>();
-  cover_hl<complex<float>, row2_type, col2_type>();
+  int verbose = 0;
+  if (argc == 2 && argv[1][0] == '1') verbose = 1;
+  if (argc == 2 && argv[1][0] == '2') verbose = 2;
 
-  cover_ll<float>();
-  cover_ll<complex<float> >();
+  cover_hl<float, row2_type, col2_type>(verbose);
+  cover_hl<complex<float>, row2_type, col2_type>(verbose);
 
+  cover_ll<float>(verbose);
+  cover_ll<complex<float> >(verbose);
+
   return 0;
 }
Index: tests/threshold.cpp
===================================================================
--- tests/threshold.cpp	(revision 191870)
+++ tests/threshold.cpp	(working copy)
@@ -14,6 +14,8 @@
   Included Files
 ***********************************************************************/
 
+#include <iostream>
+
 #include <vsip/initfin.hpp>
 #include <vsip/support.hpp>
 #include <vsip/matrix.hpp>
@@ -127,6 +129,15 @@
 									\
   for (index_type i=0; i<size; ++i)					\
   {									\
+    if (!equal(C.get(i), (A.get(i) OP B.get(i) ? T(1) : T(0))))		\
+    {									\
+      std::cerr << "TEST_LVOP FAILED: i = " << i << std::endl		\
+		<< "  C.get(i): " << C.get(i) << std::endl		\
+		<< "  A.get(i): " << A.get(i) << std::endl		\
+		<< "  B.get(i): " << B.get(i) << std::endl		\
+		<< "  expected: "					\
+		<< (A.get(i) OP B.get(i) ? T(1) : T(0)) << std::endl;	\
+    }									\
     test_assert(equal(C.get(i), (A.get(i) OP B.get(i) ? T(1) : T(0))));	\
   }									\
 }