Actions

icon Post
text/html Subscribe
text/html Unsubscribe

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

SIMD all unaligned dispatch


  • To: VSIPL++ Developers List <vsipl++@xxxxxxxxxxxxxxxx>
  • Subject: SIMD all unaligned dispatch
  • From: Assem Salama <assem@xxxxxxxxxxxxxxxx>
  • Date: Mon, 18 Jun 2007 12:03:46 -0400

Everyone,
This patch includes some pieces that were missing from the previous patch. It should make a fresh checkout compile OK :) I apologize for the previous patch's incompleteness.

Thanks,
Assem
Index: src/vsip/core/type_list.hpp
===================================================================
--- src/vsip/core/type_list.hpp	(revision 174145)
+++ src/vsip/core/type_list.hpp	(working copy)
@@ -35,12 +35,14 @@
 	  typename T9 = None_type,
 	  typename T10 = None_type,
 	  typename T11 = None_type,
-	  typename T12 = None_type>
+	  typename T12 = None_type,
+	  typename T13 = None_type,
+	  typename T14 = None_type>
 struct Make_type_list
 {
 private:
   typedef typename 
-    Make_type_list<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12>::type Rest;
+    Make_type_list<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14>::type Rest;
 public:
   typedef Type_list<T1, Rest> type;
 };
Index: src/vsip/core/impl_tags.hpp
===================================================================
--- src/vsip/core/impl_tags.hpp	(revision 174145)
+++ src/vsip/core/impl_tags.hpp	(working copy)
@@ -41,6 +41,7 @@
 struct Copy_tag {};		// Optimized Copy
 struct Op_expr_tag {};		// Special expr handling (vmmul, etc)
 struct Simd_loop_fusion_tag {};	// SIMD Loop Fusion.
+struct Simd_all_unaligned_loop_fusion_tag {};// SIMD all unaligned Loop Fusion.
 struct Simd_unaligned_loop_fusion_tag {};
 struct Fc_expr_tag {};		// Fused Fastconv RBO evaluator.
 struct Rbo_expr_tag {};		// Return-block expression evaluator.
Index: src/vsip/opt/simd/simd.hpp
===================================================================
--- src/vsip/opt/simd/simd.hpp	(revision 174261)
+++ src/vsip/opt/simd/simd.hpp	(working copy)
@@ -1137,6 +1137,26 @@
     return t;
   }
 
+  static simd_type load_unaligned(value_type const* addr)
+  { // Unaligned variant of load(): two unaligned base loads, then real/imag split.
+    base_simd_type v0=Simd_traits<T>::load_unaligned(((T const*)addr)+0);
+    base_simd_type v1=Simd_traits<T>::load_unaligned(((T const*)addr)+vec_size);
+#if __ghs__
+    simd_type t;
+    t.r = Simd_traits<T>::real_from_interleaved(v0, v1);
+    t.i = Simd_traits<T>::imag_from_interleaved(v0, v1);
+#else
+    // 070509: The aggregate initializer below causes an internal error with GHS:
+    //         "type-change_constant: integer to bad type"
+    simd_type t = 
+      {
+	Simd_traits<T>::real_from_interleaved(v0, v1),
+	Simd_traits<T>::imag_from_interleaved(v0, v1)
+      };
+#endif
+    return t;
+  }
+
   static simd_type load(value_type const* addr)
   {
     base_simd_type v0 = Simd_traits<T>::load(((T const*)addr)+0);
Index: src/vsip/opt/simd/expr_evaluator.hpp
===================================================================
--- src/vsip/opt/simd/expr_evaluator.hpp	(revision 174261)
+++ src/vsip/opt/simd/expr_evaluator.hpp	(working copy)
@@ -56,14 +56,22 @@
     !Is_split_block<BlockT>::value;
 
   static bool 
-  rt_valid(BlockT const &b)
+  rt_valid(BlockT const &b, int alignment)
   {
     Ext_data<BlockT, layout_type> dda(b, SYNC_IN);
     return dda.stride(0) == 1 && 
       (!A ||
-       !Simd_traits<typename BlockT::value_type>::alignment_of(dda.data()));
+       Simd_traits<typename BlockT::value_type>::alignment_of(dda.data()) ==
+       alignment);
   }
 
+  static int
+  alignment(BlockT const &b)
+  { // Returns Simd_traits<value_type>::alignment_of() for the block's data pointer.
+    Ext_data<BlockT, layout_type> dda(b, SYNC_IN);
+    return Simd_traits<typename BlockT::value_type>::alignment_of(dda.data());
+  }
+
   static proxy_type
   create(BlockT const &b) 
   {
@@ -80,7 +88,7 @@
   static bool const ct_valid = true;
 
   static bool 
-  rt_valid(Scalar_block<1, T> const &) {return true;}
+  rt_valid(Scalar_block<1, T> const &, int) {return true;}
 
   static proxy_type
   create(Scalar_block<1, T> const &b) 
@@ -103,9 +111,9 @@
   static bool const ct_valid = Unary_operator_map<T, O>::is_supported;
 
   static bool 
-  rt_valid(Unary_expr_block<D, O, B, T> const &b)
+  rt_valid(Unary_expr_block<D, O, B, T> const &b, int alignment)
   {
-    return Proxy_factory<B, A>::rt_valid(b.op());
+    return Proxy_factory<B, A>::rt_valid(b.op(), alignment);
   }
 
   static proxy_type
@@ -129,9 +137,9 @@
 
 
   static bool 
-  rt_valid(Unary_expr_block<D, unaligned_functor, B, T> const &b)
+  rt_valid(Unary_expr_block<D, unaligned_functor, B, T> const &b, int alignment)
   {
-    return Proxy_factory<B, false>::rt_valid(b.op());
+    return Proxy_factory<B, false>::rt_valid(b.op(), alignment);
   }
 
   static proxy_type
@@ -140,6 +148,7 @@
     return proxy_type(Proxy_factory<B, false>::create(b.op()));
   }
 };
+
 template <dimension_type                D,
 	  template <typename, typename> class O,
 	  typename                      LB,
@@ -163,10 +172,10 @@
     Proxy_factory<RB, A>::ct_valid;
 
   static bool 
-  rt_valid(Binary_expr_block<D, O, LB, LT, RB, RT> const &b)
+  rt_valid(Binary_expr_block<D, O, LB, LT, RB, RT> const &b, int alignment)
   {
-    return Proxy_factory<LB, A>::rt_valid(b.left()) &&
-      Proxy_factory<RB, A>::rt_valid(b.right());
+    return Proxy_factory<LB, A>::rt_valid(b.left(), alignment) &&
+           Proxy_factory<RB, A>::rt_valid(b.right(), alignment);
   }
 
   static proxy_type
@@ -207,11 +216,11 @@
     Proxy_factory<Block3, A>::ct_valid;
 
   static bool 
-  rt_valid(SrcBlock const &b)
+  rt_valid(SrcBlock const &b, int alignment)
   {
-    return Proxy_factory<Block1, A>::rt_valid(b.first()) &&
-           Proxy_factory<Block2, A>::rt_valid(b.second()) &&
-           Proxy_factory<Block3, A>::rt_valid(b.third());
+    return Proxy_factory<Block1, A>::rt_valid(b.first(), alignment) &&
+           Proxy_factory<Block2, A>::rt_valid(b.second(), alignment) &&
+           Proxy_factory<Block3, A>::rt_valid(b.third(), alignment);
   }
 
   static proxy_type
@@ -228,6 +237,7 @@
   }
 };
 
+
 } // namespace vsip::impl::simd
 
 
@@ -264,7 +274,7 @@
     return (dda.stride(0) == 1 &&
 	    simd::Simd_traits<typename LB::value_type>::
 	      alignment_of(dda.data()) == 0 &&
-	    simd::Proxy_factory<RB, true>::rt_valid(rhs));
+	    simd::Proxy_factory<RB, true>::rt_valid(rhs, 0));
   }
 
   static void exec(LB& lhs, RB const& rhs)
@@ -331,7 +341,86 @@
   }
 };
 
+// Evaluator for expressions in which every block is unit-stride and shares the
+// destination's alignment. See Simd_unaligned_loop_fusion_tag for mixed alignment.
+template <typename LB,
+	  typename RB>
+struct Serial_expr_evaluator<1, LB, RB, Simd_all_unaligned_loop_fusion_tag>
+{
+  typedef typename Adjust_layout_dim<
+                     1, typename Block_layout<LB>::layout_type>::type
+		layout_type;
 
+  static char const* name() { return "Expr_SIMD_All_Unaligned_Loop"; }
+  // Compile-time applicability test.
+  static bool const ct_valid =
+    // Is SIMD supported at all ?
+    simd::Simd_traits<typename LB::value_type>::is_accel &&
+    // Check that direct access is possible.
+    Ext_data_cost<LB>::value == 0 &&
+    simd::Proxy_factory<RB, true>::ct_valid &&
+    // Only allow float, double, complex<float>, and complex<double> at this time.
+    (Type_equal<typename Scalar_of<typename LB::value_type>::type, float>::value ||
+     Type_equal<typename Scalar_of<typename LB::value_type>::type, double>::value) &&
+    // Make sure both sides have the same type.
+    Type_equal<typename LB::value_type, typename RB::value_type>::value &&
+    // Make sure the left side is not a complex split block.
+    !Is_split_block<LB>::value;
+
+  // Run-time test: unit-stride lhs, and every rhs block has the lhs's alignment.
+  static bool rt_valid(LB& lhs, RB const& rhs)
+  {
+    Ext_data<LB, layout_type> dda(lhs, SYNC_OUT);
+    int lhs_a = simd::Proxy_factory<LB,       true>::alignment(lhs);
+    return (dda.stride(0) == 1 &&
+            simd::Proxy_factory<RB, true>::rt_valid(rhs, lhs_a));
+  }
+
+  // Evaluates lhs = rhs in three phases: a scalar prologue up to the first SIMD
+  // boundary of lhs, a SIMD main loop, and a scalar loop for the remainder.
+  static void exec(LB& lhs, RB const& rhs)
+  {
+    typedef typename simd::LValue_access_traits<typename LB::value_type> WAT;
+    typedef typename simd::Proxy_factory<RB, true>::access_traits EAT;
+    typedef typename simd::Proxy_factory<RB, true>::proxy_type proxy_type;
+
+    length_type const vec_size =
+      simd::Simd_traits<typename LB::value_type>::vec_size;
+    Ext_data<LB, layout_type> dda(lhs, SYNC_OUT);
+
+    simd::Proxy<WAT,true> lp(dda.data());
+    proxy_type rp(simd::Proxy_factory<RB,true>::create(rhs));
+    length_type const size = dda.size(0);
+    length_type n = size;
+
+    // Loop using proxy interface. This generates the best code with gcc 3.4
+    // (with gcc 4.1 the difference to the first case above is negligible).
+
+    // First, deal with unaligned pointers: scalar ops until lhs is SIMD-aligned.
+    typename Ext_data<LB, layout_type>::raw_ptr_type  raw_ptr = dda.data();
+    while(simd::Simd_traits<typename LB::value_type>::alignment_of(raw_ptr) &&
+          n > 0)
+    {
+      lhs.put(size-n, rhs.get(size-n));
+      n--;
+      raw_ptr++;
+    }
+    // NOTE(review): the aligned proxies round their pointer down on construction
+    // (see expr_iterator.hpp), so after a prologue they are one vector behind.
+    if (n != size && n >= vec_size) { lp.increment(); rp.increment(); }
+    while (n >= vec_size)
+    {
+      lp.store(rp.load());
+      n -= vec_size;
+      lp.increment();
+      rp.increment();
+    }
+
+    // Process the remainder, using simple loop fusion.
+    for (index_type i = size - n; i != size; ++i) lhs.put(i, rhs.get(i));
+  }
+};
+
 // This evaluator is for unaligned data. Any time any of the blocks are
 // unaligned, we use this evalutator. Basically, in the evaluator list, this
 // evaluator is right after the aligned evaluator and rt_valid determines
@@ -368,7 +457,7 @@
     return (dda.stride(0) == 1 &&
 	    simd::Simd_traits<typename LB::value_type>::
 	      alignment_of(dda.data()) == 0 &&
-	    simd::Proxy_factory<RB, false>::rt_valid(rhs));
+	    simd::Proxy_factory<RB, false>::rt_valid(rhs, 0));
   }
 
   static void exec(LB& lhs, RB const& rhs)
Index: src/vsip/opt/simd/eval_generic.hpp
===================================================================
--- src/vsip/opt/simd/eval_generic.hpp	(revision 174261)
+++ src/vsip/opt/simd/eval_generic.hpp	(working copy)
@@ -664,6 +664,8 @@
   
   static bool rt_valid(DstBlock& dst, SrcBlock const& src)
   {
+    typedef simd::Simd_traits<typename SrcBlock::value_type> simd;
+
     // check if all data is unit stride
     Ext_data<DstBlock, dst_lp>     ext_dst(dst,              SYNC_OUT);
     Ext_data<Block1,   a_lp>       ext_a(src.first().left(), SYNC_IN);
@@ -672,7 +674,11 @@
            ext_a.stride(0) == 1 &&
 	   ext_b.stride(0) == 1 &&
 	   // make sure (A op B, A, k)
-	   (&(src.first().left()) == &(src.second())));
+	   (&(src.first().left()) == &(src.second())) &&
+	   // make sure everything is aligned!
+	   !simd::alignment_of(ext_dst.data()) &&
+	   !simd::alignment_of(ext_a.data()) &&
+	   !simd::alignment_of(ext_b.data()));
   }
 
   static void exec(DstBlock& dst, SrcBlock const& src)
Index: src/vsip/opt/simd/expr_iterator.hpp
===================================================================
--- src/vsip/opt/simd/expr_iterator.hpp	(revision 174261)
+++ src/vsip/opt/simd/expr_iterator.hpp	(working copy)
@@ -268,13 +268,14 @@
   simd_type load() const
   { return simd::perm(x0_, x1_, sh_); }
 
-  void increment(length_type n = 1)
+  //void increment(length_type n = 1)
+  void increment()
   {
-    ptr_unaligned_ += n * Simd_traits<value_type>::vec_size;
-    ptr_aligned_   += n;
+    ptr_unaligned_ += Simd_traits<value_type>::vec_size;
+    ptr_aligned_++;
   
     // update x0
-    x0_ = (n == 1)? x1_:simd::load((value_type*)ptr_aligned_);
+    x0_ = x1_;
 
     // update x1
     x1_ = simd::load((value_type*)(ptr_aligned_+simd::vec_size));
@@ -299,8 +300,9 @@
 
   simd_type load() const { return simd::load_unaligned(ptr_unaligned_); }
 
-  void increment(length_type n = 1)
-  { ptr_unaligned_ += n * Simd_traits<value_type>::vec_size; }
+  //void increment(length_type n = 1)
+  void increment()
+  { ptr_unaligned_ += Simd_traits<value_type>::vec_size; }
 
   value_type const*            ptr_unaligned_;
 };
@@ -316,12 +318,18 @@
   typedef T value_type;
   typedef typename Simd_traits<value_type>::simd_type simd_type;
 
-  Proxy(value_type const *ptr) : ptr_(ptr) {}
+  Proxy(value_type const *ptr) : ptr_(ptr)
+  {
+    // Force alignment of pointer.
+    intptr_t int_ptr = (intptr_t)ptr_;
+    int_ptr &= ~(Simd_traits<value_type>::alignment-1);
+    ptr_ = (value_type*) int_ptr;
+  }
 
   simd_type load() const { return Simd_traits<value_type>::load(ptr_);}
 
   void increment(length_type n = 1)
-  { ptr_ += n * Simd_traits<value_type>::vec_size;}
+  { ptr_ += Simd_traits<value_type>::vec_size;}
 
 private:
   value_type const *ptr_;
@@ -341,7 +349,7 @@
   { return simd_loader_.load(); }
 
   void increment(length_type n = 1) 
-  { simd_loader_.increment(n); }
+  { simd_loader_.increment(); }
 
 private:
   Simd_unaligned_loader<T>      simd_loader_;
@@ -357,7 +365,14 @@
   typedef T value_type;
   typedef typename Simd_traits<value_type>::simd_type simd_type;
 
-  Proxy(value_type *ptr) : ptr_(ptr) {}
+  Proxy(value_type *ptr) : ptr_(ptr)
+  {
+    // Force alignment of pointer.
+    intptr_t int_ptr = (intptr_t)ptr_;
+    int_ptr &= ~(Simd_traits<value_type>::alignment-1);
+    ptr_ = (value_type*) int_ptr;
+  }
+
   template <typename T1>
   Proxy operator = (Proxy<T1,IsAligned> const &o) 
   {
Index: src/vsip/opt/expr/serial_dispatch_fwd.hpp
===================================================================
--- src/vsip/opt/expr/serial_dispatch_fwd.hpp	(revision 174145)
+++ src/vsip/opt/expr/serial_dispatch_fwd.hpp	(working copy)
@@ -50,6 +50,8 @@
 		       Copy_tag,
 		       Op_expr_tag,
 		       Simd_loop_fusion_tag,
+		       Simd_all_unaligned_loop_fusion_tag,
+		       Simd_unaligned_loop_fusion_tag,
 		       Fc_expr_tag,
 		       Rbo_expr_tag,
 		       Loop_fusion_tag>::type LibraryTagList;