Actions

icon Post
text/html Subscribe
text/html Unsubscribe

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[vsipl++] [patch] More 1.4 fixes


  • To: VSIPL++ Developers List <vsipl++@xxxxxxxxxxxxxxxx>
  • Subject: [vsipl++] [patch] More 1.4 fixes
  • From: Jules Bergmann <jules@xxxxxxxxxxxxxxxx>
  • Date: Tue, 26 Feb 2008 12:31:34 -0500

Hopefully the last one!

The new view_offset.cpp regression test (intended to cover the SIMD builtin functions) triggers a heisenbug in SIMD unaligned loop-fusion on Power with Fedora's GCC.

This patch works around the problem by allowing SIMD unaligned loop-fusion to be disabled in the Power binary packages.

Patch applied.

				-- Jules
--
Jules Bergmann
CodeSourcery
jules@xxxxxxxxxxxxxxxx
(650) 331-3385 x705
Index: src/vsip/opt/simd/expr_evaluator.hpp
===================================================================
--- src/vsip/opt/simd/expr_evaluator.hpp	(revision 192398)
+++ src/vsip/opt/simd/expr_evaluator.hpp	(working copy)
@@ -1,4 +1,4 @@
-/* Copyright (c) 2006, 2007 by CodeSourcery.  All rights reserved.
+/* Copyright (c) 2006, 2007, 2008 by CodeSourcery.  All rights reserved.
 
    This file is available for license from CodeSourcery, Inc. under the terms
    of a commercial license and under the GPL.  It is not part of the VSIPL++
@@ -31,6 +31,7 @@
 #include <vsip/core/metaprogramming.hpp>
 #include <vsip/core/extdata.hpp>
 #include <vsip/opt/expr/serial_evaluator.hpp>
+#include <vsip/opt/simd/proxy_factory.hpp>
 
 /***********************************************************************
   Definitions
@@ -40,213 +41,9 @@
 {
 namespace impl
 {
-namespace simd
-{
 
-template <typename BlockT, bool A>
-struct Proxy_factory
-{
-  typedef Direct_access_traits<typename BlockT::value_type> access_traits;
-  typedef Proxy<access_traits, A> proxy_type;
-  typedef typename Adjust_layout_dim<
-                     1, typename Block_layout<BlockT>::layout_type>::type
-		layout_type;
+// SIMD Loop Fusion evaluator for aligned expressions.
 
-  static bool const ct_valid = Ext_data_cost<BlockT>::value == 0 &&
-    !Is_split_block<BlockT>::value;
-
-  static bool 
-  rt_valid(BlockT const &b, int alignment)
-  {
-    Ext_data<BlockT, layout_type> dda(b, SYNC_IN);
-    return dda.stride(0) == 1 && 
-      (!A ||
-       Simd_traits<typename BlockT::value_type>::alignment_of(dda.data()) ==
-       alignment);
-  }
-
-  static int
-  alignment(BlockT const &b)
-  {
-    Ext_data<BlockT, layout_type> dda(b, SYNC_IN);
-    return Simd_traits<typename BlockT::value_type>::alignment_of(dda.data());
-  }
-
-  static proxy_type
-  create(BlockT const &b) 
-  {
-    Ext_data<BlockT, layout_type> dda(b, SYNC_IN);
-    return proxy_type(dda.data());
-  }
-};
-
-template <typename T, bool A>
-struct Proxy_factory<Scalar_block<1, T>, A>
-{
-  typedef Scalar_access_traits<T> access_traits;
-  typedef Proxy<access_traits, A> proxy_type;
-  static bool const ct_valid = true;
-
-  static bool 
-  rt_valid(Scalar_block<1, T> const &, int) {return true;}
-
-  static proxy_type
-  create(Scalar_block<1, T> const &b) 
-  {
-    return proxy_type(b.value());
-  }
-};
-
-template <dimension_type D,
-	  template <typename> class O,
-	  typename B,
-	  typename T,
-	  bool A>
-struct Proxy_factory<Unary_expr_block<D, O, B, T> const, A>
-{
-  typedef 
-    Unary_access_traits<typename Proxy_factory<B,A>::proxy_type, O>
-    access_traits;
-  typedef Proxy<access_traits,A> proxy_type;
-
-  static bool const ct_valid =
-    Unary_operator_map<T, O>::is_supported &&
-    Type_equal<typename B::value_type, T>::value &&
-    Proxy_factory<B, A>::ct_valid;
-
-  static bool 
-  rt_valid(Unary_expr_block<D, O, B, T> const &b, int alignment)
-  {
-    return Proxy_factory<B, A>::rt_valid(b.op(), alignment);
-  }
-
-  static proxy_type
-  create(Unary_expr_block<D, O, B, T> const &b)
-  {
-    return proxy_type(Proxy_factory<B, A>::create(b.op()));
-  }
-};
-
-// This proxy is specialized for unaligned blocks. If the user specifies
-// ualigned(block), this is a hint to switch to an unaligned proxy.
-template <dimension_type D,
-	  typename B,
-	  typename T,
-	  bool A>
-struct Proxy_factory<Unary_expr_block<D, unaligned_functor, B, T> const, A>
-{
-  typedef typename Proxy_factory<B, false>::access_traits access_traits;
-  typedef Proxy<access_traits,false> proxy_type;
-  static bool const ct_valid = Proxy_factory<B,false>::ct_valid;
-
-
-  static bool 
-  rt_valid(Unary_expr_block<D, unaligned_functor, B, T> const &b, int alignment)
-  {
-    return Proxy_factory<B, false>::rt_valid(b.op(), alignment);
-  }
-
-  static proxy_type
-  create(Unary_expr_block<D, unaligned_functor, B, T> const &b)
-  {
-    return proxy_type(Proxy_factory<B, false>::create(b.op()));
-  }
-};
-
-template <dimension_type                D,
-	  template <typename, typename> class O,
-	  typename                      LB,
-	  typename                      LT,
-	  typename                      RB,
-	  typename                      RT,
-	  bool A>
-struct Proxy_factory<Binary_expr_block<D, O, LB, LT, RB, RT> const, A>
-{
-  typedef
-    Binary_access_traits<typename Proxy_factory<LB, A>::proxy_type,
-			 typename Proxy_factory<RB, A>::proxy_type, O> 
-    access_traits;
-  typedef Proxy<access_traits, A> proxy_type;
-  static bool const ct_valid = 
-    Type_equal<typename LB::value_type, LT>::value &&
-    Type_equal<typename RB::value_type, RT>::value &&
-    Type_equal<LT, RT>::value &&
-    Binary_operator_map<LT, O>::is_supported &&
-    Proxy_factory<LB, A>::ct_valid &&
-    Proxy_factory<RB, A>::ct_valid;
-
-  static bool 
-  rt_valid(Binary_expr_block<D, O, LB, LT, RB, RT> const &b, int alignment)
-  {
-    return Proxy_factory<LB, A>::rt_valid(b.left(), alignment) &&
-           Proxy_factory<RB, A>::rt_valid(b.right(), alignment);
-  }
-
-  static proxy_type
-  create(Binary_expr_block<D, O, LB, LT, RB, RT> const &b)
-  {
-    typename Proxy_factory<LB, A>::proxy_type lp =
-      Proxy_factory<LB, A>::create(b.left());
-    typename Proxy_factory<RB, A>::proxy_type rp =
-      Proxy_factory<RB, A>::create(b.right());
-
-    return proxy_type(lp, rp);
-  }
-};
-
-template <dimension_type                         D,
-	  template <typename, typename,typename> class O,
-	  typename                               Block1, typename Type1,
-	  typename                               Block2, typename Type2,
-	  typename                               Block3, typename Type3,
-	  bool A>
-struct Proxy_factory<Ternary_expr_block<D, O,
-  Block1,Type1,Block2,Type2,Block3,Type3> const, A>
-{
-  typedef Ternary_access_traits<typename Proxy_factory<Block1, A>::proxy_type,
-                                typename Proxy_factory<Block2, A>::proxy_type,
-                                typename Proxy_factory<Block3, A>::proxy_type,
-		 	        O> 
-    access_traits;
-
-  typedef Ternary_expr_block<D, O, Block1,Type1,Block2,Type2,Block3,Type3>
-    SrcBlock;
-
-  typedef Proxy<access_traits, A> proxy_type;
-  static bool const ct_valid = 
-    Ternary_operator_map<Type1, O>::is_supported &&
-    Proxy_factory<Block1, A>::ct_valid &&
-    Proxy_factory<Block2, A>::ct_valid &&
-    Proxy_factory<Block3, A>::ct_valid;
-
-  static bool 
-  rt_valid(SrcBlock const &b, int alignment)
-  {
-    return Proxy_factory<Block1, A>::rt_valid(b.first(), alignment) &&
-           Proxy_factory<Block2, A>::rt_valid(b.second(), alignment) &&
-           Proxy_factory<Block3, A>::rt_valid(b.third(), alignment);
-  }
-
-  static proxy_type
-  create(SrcBlock const &b)
-  {
-    typename Proxy_factory<Block1, A>::proxy_type
-      b1p = Proxy_factory<Block1, A>::create(b.first());
-    typename Proxy_factory<Block2, A>::proxy_type
-      b2p = Proxy_factory<Block2, A>::create(b.second());
-    typename Proxy_factory<Block3, A>::proxy_type
-      b3p = Proxy_factory<Block3, A>::create(b.third());
-
-    return proxy_type(b1p,b2p,b3p);
-  }
-};
-
-
-} // namespace vsip::impl::simd
-
-
-// This evaluator is for aligned data only.
-// Look at Simd_unaligned_loop_fusion_tag for unaligned data.
 template <typename LB,
 	  typename RB>
 struct Serial_expr_evaluator<1, LB, RB, Simd_loop_fusion_tag>
@@ -326,78 +123,6 @@
   }
 };
 
-// This evaluator is for unaligned data. Any time any of the blocks are
-// unaligned, we use this evalutator. Basically, in the evaluator list, this
-// evaluator is right after the aligned evaluator and rt_valid determines
-// which one to use.
-template <typename LB,
-	  typename RB>
-struct Serial_expr_evaluator<1, LB, RB, Simd_unaligned_loop_fusion_tag>
-{
-  typedef typename Adjust_layout_dim<
-                     1, typename Block_layout<LB>::layout_type>::type
-		layout_type;
-
-  static char const* name() { return "Expr_SIMD_Unaligned_Loop"; }
-  
-  static bool const ct_valid =
-    // Is SIMD supported at all ?
-    simd::Simd_traits<typename LB::value_type>::is_accel &&
-    // Check that direct access is possible.
-    Ext_data_cost<LB>::value == 0 &&
-    simd::Proxy_factory<RB, false>::ct_valid &&
-    // Only allow float, double, complex<float>,
-    // and complex<double> at this time.
-    (Type_equal<typename Scalar_of<typename LB::value_type>::type, float>::value ||
-     Type_equal<typename Scalar_of<typename LB::value_type>::type, double>::value) &&
-    // Make sure both sides have the same type.
-    Type_equal<typename LB::value_type, typename RB::value_type>::value &&
-    // Make sure the left side is not a complex split block.
-    !Is_split_block<LB>::value;
-
-
-  static bool rt_valid(LB& lhs, RB const& rhs)
-  {
-    Ext_data<LB, layout_type> dda(lhs, SYNC_OUT);
-    return (dda.stride(0) == 1 &&
-	    simd::Simd_traits<typename LB::value_type>::
-	      alignment_of(dda.data()) == 0 &&
-	    simd::Proxy_factory<RB, false>::rt_valid(rhs, 0));
-  }
-
-  static void exec(LB& lhs, RB const& rhs)
-  {
-    typedef typename simd::LValue_access_traits<typename LB::value_type> WAT;
-    typedef typename simd::Proxy_factory<RB, false>::access_traits EAT;
-
-    length_type const vec_size =
-      simd::Simd_traits<typename LB::value_type>::vec_size;
-    Ext_data<LB, layout_type> dda(lhs, SYNC_OUT);
-
-    simd::Proxy<WAT,true>  lp(dda.data());
-    simd::Proxy<EAT,false> rp(simd::Proxy_factory<RB,false>::create(rhs));
-
-    length_type const size = dda.size(0);
-    length_type n = size;
-
-    // loop using proxy interface. This generates the best code
-    // with gcc 3.4 (with gcc 4.1 the difference to the first case
-    // above is negligible).
-
-    while (n >= vec_size)
-    {
-      lp.store(rp.load());
-      n -= vec_size;
-      lp.increment();
-      rp.increment();
-    }
-
-    // Process the remainder, using simple loop fusion.
-    for (index_type i = size - n; i != size; ++i) lhs.put(i, rhs.get(i));
-  }
-};
-
-
 } // namespace vsip::impl
 } // namespace vsip
 
Index: src/vsip/opt/simd/proxy_factory.hpp
===================================================================
--- src/vsip/opt/simd/proxy_factory.hpp	(revision 0)
+++ src/vsip/opt/simd/proxy_factory.hpp	(revision 0)
@@ -0,0 +1,249 @@
+/* Copyright (c) 2006, 2007, 2008 by CodeSourcery.  All rights reserved.
+
+   This file is available for license from CodeSourcery, Inc. under the terms
+   of a commercial license and under the GPL.  It is not part of the VSIPL++
+   reference implementation and is not available under the BSD license.
+*/
+/** @file    vsip/opt/simd/proxy_factory.hpp
+    @author  Stefan Seefeld
+    @date    2006-07-25
+    @brief   VSIPL++ Library: SIMD expression evaluator proxy factory.
+
+*/
+
+#ifndef VSIP_IMPL_SIMD_PROXY_FACTORY_HPP
+#define VSIP_IMPL_SIMD_PROXY_FACTORY_HPP
+
+#if VSIP_IMPL_REF_IMPL
+# error "vsip/opt files cannot be used as part of the reference impl."
+#endif
+
+/***********************************************************************
+  Included Files
+***********************************************************************/
+
+#include <vsip/support.hpp>
+#include <vsip/opt/simd/simd.hpp>
+#include <vsip/opt/simd/expr_iterator.hpp>
+#include <vsip/core/expr/operations.hpp>
+#include <vsip/core/expr/unary_block.hpp>
+#include <vsip/core/expr/binary_block.hpp>
+#include <vsip/core/metaprogramming.hpp>
+#include <vsip/core/extdata.hpp>
+#include <vsip/opt/expr/serial_evaluator.hpp>
+
+/***********************************************************************
+  Definitions
+***********************************************************************/
+
+namespace vsip
+{
+namespace impl
+{
+namespace simd
+{
+
+template <typename BlockT, bool A>
+struct Proxy_factory
+{
+  typedef Direct_access_traits<typename BlockT::value_type> access_traits;
+  typedef Proxy<access_traits, A> proxy_type;
+  typedef typename Adjust_layout_dim<
+                     1, typename Block_layout<BlockT>::layout_type>::type
+		layout_type;
+
+  static bool const ct_valid = Ext_data_cost<BlockT>::value == 0 &&
+    !Is_split_block<BlockT>::value;
+
+  static bool 
+  rt_valid(BlockT const &b, int alignment)
+  {
+    Ext_data<BlockT, layout_type> dda(b, SYNC_IN);
+    return dda.stride(0) == 1 && 
+      (!A ||
+       Simd_traits<typename BlockT::value_type>::alignment_of(dda.data()) ==
+       alignment);
+  }
+
+  static int
+  alignment(BlockT const &b)
+  {
+    Ext_data<BlockT, layout_type> dda(b, SYNC_IN);
+    return Simd_traits<typename BlockT::value_type>::alignment_of(dda.data());
+  }
+
+  static proxy_type
+  create(BlockT const &b) 
+  {
+    Ext_data<BlockT, layout_type> dda(b, SYNC_IN);
+    return proxy_type(dda.data());
+  }
+};
+
+template <typename T, bool A>
+struct Proxy_factory<Scalar_block<1, T>, A>
+{
+  typedef Scalar_access_traits<T> access_traits;
+  typedef Proxy<access_traits, A> proxy_type;
+  static bool const ct_valid = true;
+
+  static bool 
+  rt_valid(Scalar_block<1, T> const &, int) {return true;}
+
+  static proxy_type
+  create(Scalar_block<1, T> const &b) 
+  {
+    return proxy_type(b.value());
+  }
+};
+
+template <dimension_type D,
+	  template <typename> class O,
+	  typename B,
+	  typename T,
+	  bool A>
+struct Proxy_factory<Unary_expr_block<D, O, B, T> const, A>
+{
+  typedef 
+    Unary_access_traits<typename Proxy_factory<B,A>::proxy_type, O>
+    access_traits;
+  typedef Proxy<access_traits,A> proxy_type;
+
+  static bool const ct_valid =
+    Unary_operator_map<T, O>::is_supported &&
+    Type_equal<typename B::value_type, T>::value &&
+    Proxy_factory<B, A>::ct_valid;
+
+  static bool 
+  rt_valid(Unary_expr_block<D, O, B, T> const &b, int alignment)
+  {
+    return Proxy_factory<B, A>::rt_valid(b.op(), alignment);
+  }
+
+  static proxy_type
+  create(Unary_expr_block<D, O, B, T> const &b)
+  {
+    return proxy_type(Proxy_factory<B, A>::create(b.op()));
+  }
+};
+
+// This proxy is specialized for unaligned blocks. If the user specifies
+// unaligned(block), this is a hint to switch to an unaligned proxy.
+template <dimension_type D,
+	  typename B,
+	  typename T,
+	  bool A>
+struct Proxy_factory<Unary_expr_block<D, unaligned_functor, B, T> const, A>
+{
+  typedef typename Proxy_factory<B, false>::access_traits access_traits;
+  typedef Proxy<access_traits,false> proxy_type;
+  static bool const ct_valid = Proxy_factory<B,false>::ct_valid;
+
+
+  static bool 
+  rt_valid(Unary_expr_block<D, unaligned_functor, B, T> const &b, int alignment)
+  {
+    return Proxy_factory<B, false>::rt_valid(b.op(), alignment);
+  }
+
+  static proxy_type
+  create(Unary_expr_block<D, unaligned_functor, B, T> const &b)
+  {
+    return proxy_type(Proxy_factory<B, false>::create(b.op()));
+  }
+};
+
+template <dimension_type                D,
+	  template <typename, typename> class O,
+	  typename                      LB,
+	  typename                      LT,
+	  typename                      RB,
+	  typename                      RT,
+	  bool A>
+struct Proxy_factory<Binary_expr_block<D, O, LB, LT, RB, RT> const, A>
+{
+  typedef
+    Binary_access_traits<typename Proxy_factory<LB, A>::proxy_type,
+			 typename Proxy_factory<RB, A>::proxy_type, O> 
+    access_traits;
+  typedef Proxy<access_traits, A> proxy_type;
+  static bool const ct_valid = 
+    Type_equal<typename LB::value_type, LT>::value &&
+    Type_equal<typename RB::value_type, RT>::value &&
+    Type_equal<LT, RT>::value &&
+    Binary_operator_map<LT, O>::is_supported &&
+    Proxy_factory<LB, A>::ct_valid &&
+    Proxy_factory<RB, A>::ct_valid;
+
+  static bool 
+  rt_valid(Binary_expr_block<D, O, LB, LT, RB, RT> const &b, int alignment)
+  {
+    return Proxy_factory<LB, A>::rt_valid(b.left(), alignment) &&
+           Proxy_factory<RB, A>::rt_valid(b.right(), alignment);
+  }
+
+  static proxy_type
+  create(Binary_expr_block<D, O, LB, LT, RB, RT> const &b)
+  {
+    typename Proxy_factory<LB, A>::proxy_type lp =
+      Proxy_factory<LB, A>::create(b.left());
+    typename Proxy_factory<RB, A>::proxy_type rp =
+      Proxy_factory<RB, A>::create(b.right());
+
+    return proxy_type(lp, rp);
+  }
+};
+
+template <dimension_type                         D,
+	  template <typename, typename,typename> class O,
+	  typename                               Block1, typename Type1,
+	  typename                               Block2, typename Type2,
+	  typename                               Block3, typename Type3,
+	  bool A>
+struct Proxy_factory<Ternary_expr_block<D, O,
+  Block1,Type1,Block2,Type2,Block3,Type3> const, A>
+{
+  typedef Ternary_access_traits<typename Proxy_factory<Block1, A>::proxy_type,
+                                typename Proxy_factory<Block2, A>::proxy_type,
+                                typename Proxy_factory<Block3, A>::proxy_type,
+		 	        O> 
+    access_traits;
+
+  typedef Ternary_expr_block<D, O, Block1,Type1,Block2,Type2,Block3,Type3>
+    SrcBlock;
+
+  typedef Proxy<access_traits, A> proxy_type;
+  static bool const ct_valid = 
+    Ternary_operator_map<Type1, O>::is_supported &&
+    Proxy_factory<Block1, A>::ct_valid &&
+    Proxy_factory<Block2, A>::ct_valid &&
+    Proxy_factory<Block3, A>::ct_valid;
+
+  static bool 
+  rt_valid(SrcBlock const &b, int alignment)
+  {
+    return Proxy_factory<Block1, A>::rt_valid(b.first(), alignment) &&
+           Proxy_factory<Block2, A>::rt_valid(b.second(), alignment) &&
+           Proxy_factory<Block3, A>::rt_valid(b.third(), alignment);
+  }
+
+  static proxy_type
+  create(SrcBlock const &b)
+  {
+    typename Proxy_factory<Block1, A>::proxy_type
+      b1p = Proxy_factory<Block1, A>::create(b.first());
+    typename Proxy_factory<Block2, A>::proxy_type
+      b2p = Proxy_factory<Block2, A>::create(b.second());
+    typename Proxy_factory<Block3, A>::proxy_type
+      b3p = Proxy_factory<Block3, A>::create(b.third());
+
+    return proxy_type(b1p,b2p,b3p);
+  }
+};
+
+
+} // namespace vsip::impl::simd
+} // namespace vsip::impl
+} // namespace vsip
+
+#endif // VSIP_IMPL_SIMD_PROXY_FACTORY_HPP
Index: src/vsip/opt/simd/eval_unaligned.hpp
===================================================================
--- src/vsip/opt/simd/eval_unaligned.hpp	(revision 0)
+++ src/vsip/opt/simd/eval_unaligned.hpp	(revision 0)
@@ -0,0 +1,121 @@
+/* Copyright (c) 2006, 2007, 2008 by CodeSourcery.  All rights reserved.
+
+   This file is available for license from CodeSourcery, Inc. under the terms
+   of a commercial license and under the GPL.  It is not part of the VSIPL++
+   reference implementation and is not available under the BSD license.
+*/
+/** @file    vsip/opt/simd/eval_unaligned.hpp
+    @author  Stefan Seefeld
+    @date    2006-07-25
+    @brief   VSIPL++ Library: SIMD expression evaluator logic.
+
+*/
+
+#ifndef VSIP_IMPL_SIMD_EVAL_UNALIGNED_HPP
+#define VSIP_IMPL_SIMD_EVAL_UNALIGNED_HPP
+
+#if VSIP_IMPL_REF_IMPL
+# error "vsip/opt files cannot be used as part of the reference impl."
+#endif
+
+/***********************************************************************
+  Included Files
+***********************************************************************/
+
+#include <vsip/support.hpp>
+#include <vsip/opt/simd/simd.hpp>
+#include <vsip/opt/simd/expr_iterator.hpp>
+#include <vsip/core/expr/operations.hpp>
+#include <vsip/core/expr/unary_block.hpp>
+#include <vsip/core/expr/binary_block.hpp>
+#include <vsip/core/metaprogramming.hpp>
+#include <vsip/core/extdata.hpp>
+#include <vsip/opt/expr/serial_evaluator.hpp>
+#include <vsip/opt/simd/proxy_factory.hpp>
+
+/***********************************************************************
+  Definitions
+***********************************************************************/
+
+namespace vsip
+{
+namespace impl
+{
+
+// SIMD Loop Fusion evaluator for unaligned expressions.
+//
+// Handles expressions where the result is aligned, but the operands
+// are unaligned.
+
+template <typename LB,
+	  typename RB>
+struct Serial_expr_evaluator<1, LB, RB, Simd_unaligned_loop_fusion_tag>
+{
+  typedef typename Adjust_layout_dim<
+                     1, typename Block_layout<LB>::layout_type>::type
+		layout_type;
+
+  static char const* name() { return "Expr_SIMD_Unaligned_Loop"; }
+  
+  static bool const ct_valid =
+    // Is SIMD supported at all ?
+    simd::Simd_traits<typename LB::value_type>::is_accel &&
+    // Check that direct access is possible.
+    Ext_data_cost<LB>::value == 0 &&
+    simd::Proxy_factory<RB, false>::ct_valid &&
+    // Only allow float, double, complex<float>,
+    // and complex<double> at this time.
+    (Type_equal<typename Scalar_of<typename LB::value_type>::type, float>::value ||
+     Type_equal<typename Scalar_of<typename LB::value_type>::type, double>::value) &&
+    // Make sure both sides have the same type.
+    Type_equal<typename LB::value_type, typename RB::value_type>::value &&
+    // Make sure the left side is not a complex split block.
+    !Is_split_block<LB>::value;
+
+
+  static bool rt_valid(LB& lhs, RB const& rhs)
+  {
+    Ext_data<LB, layout_type> dda(lhs, SYNC_OUT);
+    return (dda.stride(0) == 1 &&
+	    simd::Simd_traits<typename LB::value_type>::
+	      alignment_of(dda.data()) == 0 &&
+	    simd::Proxy_factory<RB, false>::rt_valid(rhs, 0));
+  }
+
+  static void exec(LB& lhs, RB const& rhs)
+  {
+    typedef typename simd::LValue_access_traits<typename LB::value_type> WAT;
+    typedef typename simd::Proxy_factory<RB, false>::access_traits EAT;
+
+    length_type const vec_size =
+      simd::Simd_traits<typename LB::value_type>::vec_size;
+    Ext_data<LB, layout_type> dda(lhs, SYNC_OUT);
+
+    simd::Proxy<WAT,true>  lp(dda.data());
+    simd::Proxy<EAT,false> rp(simd::Proxy_factory<RB,false>::create(rhs));
+
+    length_type const size = dda.size(0);
+    length_type n = size;
+
+    // loop using proxy interface. This generates the best code
+    // with gcc 3.4 (with gcc 4.1 the difference to the first case
+    // above is negligible).
+
+    while (n >= vec_size)
+    {
+      lp.store(rp.load());
+      n -= vec_size;
+      lp.increment();
+      rp.increment();
+    }
+
+    // Process the remainder, using simple loop fusion.
+    for (index_type i = size - n; i != size; ++i) lhs.put(i, rhs.get(i));
+  }
+};
+
+
+} // namespace vsip::impl
+} // namespace vsip
+
+#endif // VSIP_IMPL_SIMD_EVAL_UNALIGNED_HPP
Index: src/vsip/opt/simd/expr_iterator.hpp
===================================================================
--- src/vsip/opt/simd/expr_iterator.hpp	(revision 192398)
+++ src/vsip/opt/simd/expr_iterator.hpp	(working copy)
@@ -327,34 +327,31 @@
   typedef typename simd::perm_simd_type  perm_simd_type;
   typedef typename simd::value_type      value_type;
 
-  Simd_unaligned_loader(value_type const* ptr) : ptr_unaligned_(ptr) 
+  Simd_unaligned_loader(value_type const* ptr)
   {
     ptr_aligned_    = (value_type*)((intptr_t)ptr & ~(simd::alignment-1));
 
     x0_  = simd::load((value_type*)ptr_aligned_);
-    x1_  = simd::load((value_type*)(ptr_aligned_+simd::vec_size));
-    sh_  = simd::shift_for_addr(ptr_unaligned_);
+    sh_  = simd::shift_for_addr(ptr);
   }
 
   simd_type load() const
-  { return simd::perm(x0_, x1_, sh_); }
+  {
+    x1_  = simd::load((value_type*)(ptr_aligned_+simd::vec_size));
+    return simd::perm(x0_, x1_, sh_);
+  }
 
   void increment(length_type n = 1)
   {
-    ptr_unaligned_ += n * simd::vec_size;
     ptr_aligned_   += n * simd::vec_size;
   
     // update x0
     x0_ = (n == 1) ? x1_ : simd::load((value_type*)ptr_aligned_);
-
-    // update x1
-    x1_ = simd::load((value_type*)(ptr_aligned_+simd::vec_size));
   }
 
-  value_type const*            ptr_unaligned_;
   value_type const*            ptr_aligned_;
   simd_type                    x0_;
-  simd_type                    x1_;
+  mutable simd_type            x1_;
   perm_simd_type               sh_;
 
 };
@@ -568,7 +565,7 @@
   AB const &left() const { return left_;}
   C const &right() const { return right_;}
 
-  simd_type load() const 
+  simd_type load() const
   {
     simd_type a = left_.left().load();
     simd_type b = left_.right().load();
Index: src/vsip/opt/expr/serial_dispatch.hpp
===================================================================
--- src/vsip/opt/expr/serial_dispatch.hpp	(revision 192398)
+++ src/vsip/opt/expr/serial_dispatch.hpp	(working copy)
@@ -47,6 +47,9 @@
 #ifdef VSIP_IMPL_HAVE_SIMD_LOOP_FUSION
 #  include <vsip/opt/simd/expr_evaluator.hpp>
 #endif
+#ifdef VSIP_IMPL_HAVE_SIMD_UNALIGNED_LOOP_FUSION
+#  include <vsip/opt/simd/eval_unaligned.hpp>
+#endif
 #ifdef VSIP_IMPL_HAVE_SIMD_GENERIC
 #  include <vsip/opt/simd/eval_generic.hpp>
 #endif
Index: ChangeLog
===================================================================
--- ChangeLog	(revision 194489)
+++ ChangeLog	(working copy)
@@ -1,3 +1,20 @@
+2008-02-26  Jules Bergmann  <jules@xxxxxxxxxxxxxxxx>
+
+	* src/vsip/opt/simd/expr_evaluator.hpp
+	* src/vsip/opt/simd/proxy_factory.hpp: New file, Proxy_factory from
+	  expr_evaluator.hpp.
+	* src/vsip/opt/simd/eval_unaligned.hpp: New file, unaligned SIMD
+	  loop-fusion evaluator from expr_evaluator.hpp.
+	* src/vsip/opt/simd/expr_iterator.hpp (Simd_unaligned_loader): Move
+	  loads around to avoid second load past end of vector
+	  (first load inevitable).
+	* src/vsip/opt/expr/serial_dispatch.hpp: Include eval_unaligned.hpp.
+	* configure.ac (--enable-simd-unaligned-loop-fusion): Allow SIMD
+	  unaligned loop fusion to be controlled independently of aligned
+	  loop fusion.
+	* doc/quickstart/quickstart.xml: Document --enable-simd-loop-fusion
+	  and --enable-simd-unaligned-loop-fusion.
+
 2008-02-25  Jules Bergmann  <jules@xxxxxxxxxxxxxxxx>
 
 	* src/vsip/opt/simd/rscvmul.hpp: Fix bug in handling unalignment.
Index: configure.ac
===================================================================
--- configure.ac	(revision 192398)
+++ configure.ac	(working copy)
@@ -303,9 +303,15 @@
 
 AC_ARG_ENABLE([simd_loop_fusion],
   AS_HELP_STRING([--enable-simd-loop-fusion],
-                 [Enable SIMD loop-fusion.]),,
+                 [Enable SIMD loop-fusion (Disable by default).]),,
   [enable_simd_loop_fusion=no])
 
+AC_ARG_ENABLE([simd_unaligned_loop_fusion],
+  AS_HELP_STRING([--enable-simd-unaligned-loop-fusion],
+                 [Enable SIMD loop-fusion for unaligned expressions
+                  (Follows --enable-simd-loop-fusion by default).]),,
+  [enable_simd_unaligned_loop_fusion=default])
+
 AC_ARG_WITH([builtin_simd_routines],
   AS_HELP_STRING([--with-builtin-simd-routines=WHAT],
                  [Use builtin SIMD routines.]),,
@@ -856,13 +862,22 @@
 #
 # Configure use of SIMD loop-fusion
 #
+if test "$enable_simd_unaligned_loop_fusion" = "default"; then
+  enable_simd_unaligned_loop_fusion=$enable_simd_loop_fusion
+fi
+
 if test "$enable_simd_loop_fusion" = "yes"; then
   AC_DEFINE_UNQUOTED(VSIP_IMPL_HAVE_SIMD_LOOP_FUSION, 1,
     [Define whether to use SIMD loop-fusion in expr dispatch.])
 fi
 
+if test "$enable_simd_unaligned_loop_fusion" = "yes"; then
+  AC_DEFINE_UNQUOTED(VSIP_IMPL_HAVE_SIMD_UNALIGNED_LOOP_FUSION, 1,
+    [Define whether to use SIMD unaligned loop-fusion in expr dispatch.])
+fi
 
 
+
 #
 # Configure use of builtin SIMD routines
 #
@@ -1055,6 +1070,8 @@
 else
   AC_MSG_RESULT([Complex storage format:                  interleaved])
 fi
+AC_MSG_RESULT([Using SIMD aligned loop-fusion           ${enable_simd_loop_fusion}])
+AC_MSG_RESULT([Using SIMD unaligned loop-fusion         ${enable_simd_unaligned_loop_fusion}])
 AC_MSG_RESULT([Timer:                                   ${enable_timer}])
 AC_MSG_RESULT([With Python bindings:                    ${enable_scripting}])
 
Index: doc/quickstart/quickstart.xml
===================================================================
--- doc/quickstart/quickstart.xml	(revision 192398)
+++ doc/quickstart/quickstart.xml	(working copy)
@@ -1215,6 +1215,40 @@
      </varlistentry>
 
      <varlistentry>
+      <term><option>--enable-simd-loop-fusion</option></term>
+      <listitem>
+       <para>
+        Enable VSIPL++ to generate SIMD instructions for loop-fusion
+        expressions (containing data that is SIMD aligned).
+
+        This option is useful for increasing performance of many
+        VSIPL++ expressions on platforms with SIMD instruction
+        set extensions (such as Intel SSE, or Power VMX/AltiVec).
+
+        The default is not to generate SIMD instructions.
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
+      <term><option>--enable-simd-unaligned-loop-fusion</option></term>
+      <listitem>
+       <para>
+        Enable VSIPL++ to generate SIMD instructions for loop-fusion
+        expressions, possibly containing data that is SIMD unaligned.
+
+        This option is useful for increasing performance of VSIPL++
+        expressions that work with unaligned data on platforms with
+        SIMD instruction set extensions (such as Intel SSE, or Power
+        VMX/AltiVec).
+
+        The default is to follow the setting of
+        <option>--enable-simd-loop-fusion</option>.
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
       <term><option>--with-complex=<replaceable>format</replaceable></option></term>
       <listitem>
        <para>