Actions

icon Post
text/html Subscribe
text/html Unsubscribe

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[patch] BLAS dispatch


  • To: VSIPL++ Developers List <vsipl++@xxxxxxxxxxxxxxxx>
  • Subject: [patch] BLAS dispatch
  • From: Don McCoy <don@xxxxxxxxxxxxxxxx>
  • Date: Fri, 04 Nov 2005 12:36:22 -0700

The attached patch adds dispatch support for certain BLAS functions. Two things that are worth drawing attention to are: 1) The row-major cases for outer() with complex values and 2) The various run-time and compile-time checks used in the blas evaluator functions.

For 1), my concern is that the BLAS 'ger' variant used can only conjugate the second vector argument. I'm using the non-conj version and performing the conjugation on the first vector argument manually. It involves memory allocation and an extra loop through one of the vectors. I'd like to know if there is a more efficient way to do this.

For 2), just want to make sure I didn't omit any checks that would dispatch a call to BLAS that it cannot handle. I was careful to verify that BLAS was only called when it should be, but it would be easy to overlook something if there is not a corresponding test case for it. In cases like outer, it is not tested with a column-major result matrix if only the vsip::outer() is called (because it allocates the matrix with the default block). So, for the test, I added the col-major cases explicitly by calling vsip::impl::outer() directly. There may be other cases where we should add specific tests for col-major layouts.

Regards,

--
Don McCoy
CodeSourcery, LLC




2005-11-04  Don McCoy  <don@xxxxxxxxxxxxxxxx>

	* tests/matvec-prod.cpp: Re-arranged tests to avoid running tests
	  repeatedly with the same ordering.  Added tests for vector-matrix
	  and matrix-vector products.
	* tests/matvec.cpp: added test for outer().
	* tests/ref_matvec.hpp: modified ref::outer() to conjugate complex
	  values.  Added vector-vector product to use for matrix-matrix
	  product.  Added v-m and m-v products as well.
	* src/vsip/impl/eval-blas.hpp: Added evaluators for BLAS outer,
	  m-v prod, v-m prod and general matrix multiply (gemm).  Fixed
	  a bug in the runtime check for m-m prod that only affected
	  col-major cases.
	* src/vsip/impl/general_dispatch.hpp: Added operation tags for
	  m-v and v-m products.  New implementation tag for SAL.  New
	  wrapper classes for operand lists of 3 and 4 arguments along
	  with the corresponding dispatch classes.
	* src/vsip/impl/lapack.hpp: Included prototypes for gemv and ger
	  BLAS functions with overloaded wrappers for calling them.
	* src/vsip/impl/matvec-prod.hpp: Added generic evaluators for
	  m-v and v-m products.  Added dispatch functions for same.
	* src/vsip/impl/matvec.hpp: Same as above for outer and gemp.
Index: tests/matvec-prod.cpp
===================================================================
RCS file: /home/cvs/Repository/vpp/tests/matvec-prod.cpp,v
retrieving revision 1.3
diff -c -p -r1.3 matvec-prod.cpp
*** tests/matvec-prod.cpp	12 Oct 2005 12:45:05 -0000	1.3
--- tests/matvec-prod.cpp	4 Nov 2005 19:32:25 -0000
*************** check_prod(
*** 68,73 ****
--- 68,106 ----
  }
  
  
+ template <typename T0,
+ 	  typename T1,
+           typename T2,
+           typename Block0,
+           typename Block1,
+           typename Block2>
+ void
+ check_prod(
+   Vector<T0, Block0> test,
+   Vector<T1, Block1> chk,
+   Vector<T2, Block2> gauge)
+ {
+   typedef typename Promotion<T0, T1>::type return_type;
+   typedef typename vsip::impl::Scalar_of<return_type>::type scalar_type;
+ 
+   Index<1> idx;
+   scalar_type err = maxval(((mag(chk - test)
+ 			     / Precision_traits<scalar_type>::eps)
+ 			    / gauge),
+ 			   idx);
+ 
+ #if VERBOSE
+   cout << "test  =\n" << test;
+   cout << "chk   =\n" << chk;
+   cout << "gauge =\n" << gauge;
+   cout << "err = " << err << endl;
+ #endif
+ 
+   assert(err < 10.0);
+ }
+ 
+ 
+ 
  /***********************************************************************
    Test Definitions
  ***********************************************************************/
*************** test_prod_rand(length_type m, length_typ
*** 120,125 ****
--- 153,160 ----
  	gauge(i, j) = scalar_type(1);
  
  #if VERBOSE
+   cout << "[" << m << "x" << n << "]  [" << 
+     n << "x" << k << "]  [" << m << "x" << k << "]  "  << endl;
    cout << "a     =\n" << a;
    cout << "b     =\n" << b;
  #endif
*************** test_prod_rand(length_type m, length_typ
*** 130,135 ****
--- 165,275 ----
  }
  
  
+ template <typename T0,
+ 	  typename T1>
+ void
+ test_prod_mv(length_type m, length_type n)
+ {
+   typedef typename Promotion<T0, T1>::type return_type;
+   typedef typename vsip::impl::Scalar_of<return_type>::type scalar_type;
+ 
+   Matrix<T0> a(m, n, T0());
+   Vector<T1> b1(n, T1());
+   Vector<T1> b2(m, T1());
+ 
+   randm(a);
+   randv(b1);
+   randv(b2);
+ 
+ #if VERBOSE
+   cout << "[" << m << "x" << n << "]"  << endl;
+   cout << "a     =\n" << a;
+   cout << "b1    =\n" << b1;
+   cout << "b2    =\n" << b2;
+ #endif
+ 
+   Vector<return_type> r1(m);
+   Vector<return_type> chk1(m);
+   Vector<scalar_type> gauge1(m);
+ 
+   r1 = prod( a, b1 );
+   chk1 = ref::prod( a, b1 );
+   gauge1 = ref::prod( mag(a), mag(b1) );
+ 
+   for (index_type i=0; i<gauge1.size(0); ++i)
+     if (!(gauge1(i) > scalar_type()))
+       gauge1(i) = scalar_type(1);
+ 
+   check_prod( r1, chk1, gauge1 );
+ 
+   Vector<return_type> r2(n);
+   Vector<return_type> chk2(n);
+   Vector<scalar_type> gauge2(n);
+ 
+   r2 = prod( trans(a), b2 );
+   chk2 = ref::prod( trans(a), b2 );
+   gauge2 = ref::prod( mag(trans(a)), mag(b2) );
+ 
+   for (index_type i=0; i<gauge2.size(0); ++i)
+     if (!(gauge2(i) > scalar_type()))
+       gauge2(i) = scalar_type(1);
+ 
+   check_prod( r2, chk2, gauge2 );
+ }
+ 
+ template <typename T0,
+ 	  typename T1>
+ void
+ test_prod_vm(length_type m, length_type n)
+ {
+   typedef typename Promotion<T0, T1>::type return_type;
+   typedef typename vsip::impl::Scalar_of<return_type>::type scalar_type;
+ 
+   Vector<T1> a1(m, T1());
+   Vector<T1> a2(n, T1());
+   Matrix<T0> b(m, n, T0());
+ 
+   randv(a1);
+   randv(a2);
+   randm(b);
+ 
+ #if VERBOSE
+   cout << "[" << m << "x" << n << "]"  << endl;
+   cout << "a1    =\n" << a1;
+   cout << "a2    =\n" << a2;
+   cout << "b     =\n" << b;
+ #endif
+ 
+   Vector<return_type> r1(n);
+   Vector<return_type> chk1(n);
+   Vector<scalar_type> gauge1(n);
+ 
+   r1 = prod( a1, b );
+   chk1 = ref::prod( a1, b );
+   gauge1 = ref::prod( mag(a1), mag(b) );
+ 
+   for (index_type i=0; i<gauge1.size(0); ++i)
+     if (!(gauge1(i) > scalar_type()))
+       gauge1(i) = scalar_type(1);
+ 
+   check_prod( r1, chk1, gauge1 );
+ 
+   Vector<return_type> r2(m);
+   Vector<return_type> chk2(m);
+   Vector<scalar_type> gauge2(m);
+ 
+   r2 = prod( a2, trans(b) );
+   chk2 = ref::prod( a2, trans(b) );
+   gauge2 = ref::prod( mag(a2), mag(trans(b)) );
+ 
+   for (index_type i=0; i<gauge2.size(0); ++i)
+     if (!(gauge2(i) > scalar_type()))
+       gauge2(i) = scalar_type(1);
+ 
+   check_prod( r2, chk2, gauge2 );
+ }
+ 
+ 
  
  template <typename T0,
  	  typename T1>
*************** test_prod3_rand()
*** 170,175 ****
--- 310,318 ----
  #if VERBOSE
    cout << "a     =\n" << a;
    cout << "b     =\n" << b;
+   cout << "chk   =\n" << chk;
+   cout << "res1  =\n" << res1;
+   cout << "res2  =\n" << res2;
  #endif
  
    check_prod( res1, chk, gauge );
*************** template <typename T0,
*** 344,363 ****
  	  typename Order0,
  	  typename Order1>
  void
! prod_cases()
  {
    test_prod_rand<T0, T1, OrderR, Order0, Order1>(5, 5, 5);
    test_prod_rand<T0, T1, OrderR, Order0, Order1>(5, 7, 9);
    test_prod_rand<T0, T1, OrderR, Order0, Order1>(9, 5, 7);
    test_prod_rand<T0, T1, OrderR, Order0, Order1>(9, 7, 5);
  
    test_prod3_rand<T0, T1>();
    test_prod4_rand<T0, T1>();
- 
    test_prodt_rand<T0, T1>(5, 5, 5);
    test_prodt_rand<T0, T1>(5, 7, 9);
    test_prodt_rand<T0, T1>(9, 5, 7);
    test_prodt_rand<T0, T1>(9, 7, 5);
  }
  
  
--- 487,531 ----
  	  typename Order0,
  	  typename Order1>
  void
! prod_types_with_order()
  {
    test_prod_rand<T0, T1, OrderR, Order0, Order1>(5, 5, 5);
    test_prod_rand<T0, T1, OrderR, Order0, Order1>(5, 7, 9);
    test_prod_rand<T0, T1, OrderR, Order0, Order1>(9, 5, 7);
    test_prod_rand<T0, T1, OrderR, Order0, Order1>(9, 7, 5);
+ }
+ 
+ 
+ template <typename T0,
+ 	  typename T1>
+ void
+ prod_cases_with_order()
+ {
+   prod_types_with_order<T0, T1, row2_type, row2_type, row2_type>();
+   prod_types_with_order<T0, T1, row2_type, col2_type, row2_type>();
+   prod_types_with_order<T0, T1, row2_type, row2_type, col2_type>();
+   prod_types_with_order<T0, T1, col2_type, col2_type, col2_type>();
+   prod_types_with_order<T0, T1, col2_type, row2_type, col2_type>();
+   prod_types_with_order<T0, T1, col2_type, col2_type, row2_type>();
+ }
+ 
+ 
+ template <typename T0,
+ 	  typename T1>
+ void
+ prod_cases()
+ {
+   test_prod_mv<T0, T1>(5, 7);
+   test_prod_vm<T0, T1>(5, 7);
  
    test_prod3_rand<T0, T1>();
    test_prod4_rand<T0, T1>();
    test_prodt_rand<T0, T1>(5, 5, 5);
    test_prodt_rand<T0, T1>(5, 7, 9);
    test_prodt_rand<T0, T1>(9, 5, 7);
    test_prodt_rand<T0, T1>(9, 7, 5);
+ 
+   prod_cases_with_order<T0, T1>();
  }
  
  
*************** prod_cases_complex_only()
*** 379,403 ****
  
  
  
- template <typename OrderR,
- 	  typename Order0,
- 	  typename Order1>
- void
- prod_types()
- {
-   prod_cases<float,  float,  OrderR, Order0, Order1>();
-   prod_cases<double, double, OrderR, Order0, Order1>();
-   prod_cases<float,  double, OrderR, Order0, Order1>();
-   prod_cases<double, float,  OrderR, Order0, Order1>();
- 
-   prod_cases<complex<float>, complex<float>, OrderR, Order0, Order1 >();
-   prod_cases<float,          complex<float>, OrderR, Order0, Order1 >();
-   prod_cases<complex<float>, float,          OrderR, Order0, Order1 >();
- 
-   prod_cases_complex_only<complex<float>, complex<float> >();
- }
- 
- 
  
  /***********************************************************************
    Main
--- 547,552 ----
*************** main(int argc, char** argv)
*** 414,424 ****
    Precision_traits<float>::compute_eps();
    Precision_traits<double>::compute_eps();
  
!   prod_types<row2_type, row2_type, row2_type>();
!   prod_types<row2_type, col2_type, row2_type>();
!   prod_types<row2_type, row2_type, col2_type>();
! 
!   prod_types<col2_type, col2_type, col2_type>();
!   prod_types<col2_type, row2_type, col2_type>();
!   prod_types<col2_type, col2_type, row2_type>();
  }
--- 563,575 ----
    Precision_traits<float>::compute_eps();
    Precision_traits<double>::compute_eps();
  
!   prod_cases<float,  float>();
!   prod_cases<double, double>();
!   prod_cases<float,  double>();
!   prod_cases<double, float>();
!   prod_cases<complex<float>, complex<float> >();
!   prod_cases<float,          complex<float> >();
!   prod_cases<complex<float>, float          >();
! 
!   prod_cases_complex_only<complex<float>, complex<float> >();
  }
Index: tests/matvec.cpp
===================================================================
RCS file: /home/cvs/Repository/vpp/tests/matvec.cpp,v
retrieving revision 1.3
diff -c -p -r1.3 matvec.cpp
*** tests/matvec.cpp	5 Oct 2005 11:41:03 -0000	1.3
--- tests/matvec.cpp	4 Nov 2005 19:32:25 -0000
***************
*** 20,25 ****
--- 20,26 ----
  #include "test.hpp"
  #include "test-random.hpp"
  #include "output.hpp"
+ #include "ref_matvec.hpp"
  
  using namespace std;
  using namespace vsip;
*************** Test_cumsum()
*** 335,340 ****
--- 336,371 ----
  }  
  
  
+ template <typename T>
+ void
+ Test_outer( T alpha, const length_type m, const length_type n )
+ {
+   {
+     Vector<T> a(m, T());
+     Vector<T> b(n, T());
+     Matrix<T> r(m, n, T());
+     Matrix<T> c1(m, n, T());
+     Matrix<T, Dense<2, T, col2_type> > c2(m, n, T());
+ 
+     randv(a);
+     randv(b);
+ 
+     c1 = outer(alpha, a, b);
+     impl::outer(alpha, a, b, c2);
+     r = ref::outer(alpha * a, b);
+ 
+     for ( vsip::index_type i = 0; i < r.size(0); ++i )
+       for ( vsip::index_type j = 0; j < r.size(1); ++j )
+       {
+         assert( equal( r.get(i, j), c1.get(i, j) ) );
+         assert( equal( r.get(i, j), c2.get(i, j) ) );
+       }
+   }
+ }
+ 
+ 
+ 
+ 
  /***********************************************************************
    Main
  ***********************************************************************/
*************** main(int argc, char** argv)
*** 380,385 ****
--- 411,429 ----
    
    Test_cumsum();
  
+   Test_outer<float>( static_cast<float>(M_PI), 3, 3 );
+   Test_outer<float>( static_cast<float>(M_PI), 5, 7 );
+   Test_outer<float>( static_cast<float>(M_PI), 7, 5 );
+   Test_outer<double>( static_cast<double>(M_PI), 3, 3 );
+   Test_outer<double>( static_cast<double>(M_PI), 5, 7 );
+   Test_outer<double>( static_cast<double>(M_PI), 7, 5 );
+   Test_outer<complex<float> >( complex<float>(M_PI, M_E), 3, 3 );
+   Test_outer<complex<float> >( complex<float>(M_PI, M_E), 5, 7 );
+   Test_outer<complex<float> >( complex<float>(M_PI, M_E), 7, 5 );
+   Test_outer<complex<double> >( complex<double>(M_PI, M_E), 3, 3 );
+   Test_outer<complex<double> >( complex<double>(M_PI, M_E), 5, 7 );
+   Test_outer<complex<double> >( complex<double>(M_PI, M_E), 7, 5 );
+ 
    return 0;
  }
  
Index: tests/ref_matvec.hpp
===================================================================
RCS file: /home/cvs/Repository/vpp/tests/ref_matvec.hpp,v
retrieving revision 1.1
diff -c -p -r1.1 ref_matvec.hpp
*** tests/ref_matvec.hpp	12 Oct 2005 12:45:05 -0000	1.1
--- tests/ref_matvec.hpp	4 Nov 2005 19:32:25 -0000
*************** dot(
*** 52,58 ****
  
  
  
! // Reference outer-product function.
  
  template <typename T0,
  	  typename T1,
--- 52,58 ----
  
  
  
! // Reference outer-product functions.
  
  template <typename T0,
  	  typename T1,
*************** outer(
*** 75,83 ****
    return r;
  }
  
  
  
! // Reference matrix-matrix product function (using outer-product).
  
  template <typename T0,
  	  typename T1,
--- 75,129 ----
    return r;
  }
  
+ template <typename T0,
+ 	  typename T1,
+ 	  typename Block0,
+ 	  typename Block1>
+ vsip::Matrix<typename vsip::Promotion<std::complex<T0>, std::complex<T1> >::type>
+ outer(
+   vsip::const_Vector<std::complex<T0>, Block0> u,
+   vsip::const_Vector<std::complex<T1>, Block1> v)
+ {
+   typedef typename vsip::Promotion<std::complex<T0>, std::complex<T1> >::type return_type;
  
+   vsip::Matrix<return_type> r(u.size(), v.size());
  
!   for (vsip::index_type i=0; i<u.size(); ++i)
!     for (vsip::index_type j=0; j<v.size(); ++j)
!       // r(i, j) = u(i) * v(j);
!       r.put(i, j, u.get(i) * conj(v.get(j)));
! 
!   return r;
! }
! 
! 
! // Reference vector-vector product
! 
! template <typename T0,
! 	  typename T1,
! 	  typename Block0,
! 	  typename Block1>
! vsip::Matrix<typename vsip::Promotion<T0, T1>::type>
! vv_prod(
!   vsip::const_Vector<T0, Block0> u,
!   vsip::const_Vector<T1, Block1> v)
! {
!   typedef typename vsip::Promotion<T0, T1>::type return_type;
! 
!   vsip::Matrix<return_type> r(u.size(), v.size());
! 
!   for (vsip::index_type i=0; i<u.size(); ++i)
!     for (vsip::index_type j=0; j<v.size(); ++j)
!       // r(i, j) = u(i) * v(j);
!       r.put(i, j, u.get(i) * v.get(j));
! 
!   return r;
! }
! 
! 
! 
! 
! // Reference matrix-matrix product function (using vv-product).
  
  template <typename T0,
  	  typename T1,
*************** prod(
*** 95,101 ****
    vsip::Matrix<return_type> r(a.size(0), b.size(1), return_type());
  
    for (vsip::index_type k=0; k<a.size(1); ++k)
!     r += ref::outer(a.col(k), b.row(k));
  
    return r;
  }
--- 141,195 ----
    vsip::Matrix<return_type> r(a.size(0), b.size(1), return_type());
  
    for (vsip::index_type k=0; k<a.size(1); ++k)
!     r += ref::vv_prod(a.col(k), b.row(k));
! 
!   return r;
! }
! 
! 
! // Reference matrix-vector product function (using dot-product).
! 
! template <typename T0,
! 	  typename T1,
! 	  typename Block0,
! 	  typename Block1>
! vsip::Vector<typename vsip::Promotion<T0, T1>::type>
! prod(
!   vsip::const_Matrix<T0, Block0> a,
!   vsip::const_Vector<T1, Block1> b)
! {
!   typedef typename vsip::Promotion<T0, T1>::type return_type;
! 
!   assert(a.size(1) == b.size(0));
! 
!   vsip::Vector<return_type> r(a.size(0), return_type());
! 
!   for (vsip::index_type k=0; k<a.size(0); ++k)
!     r.put( k, ref::dot(a.row(k), b) );
! 
!   return r;
! }
! 
! 
! // Reference vector-matrix product function (using dot-product).
! 
! template <typename T0,
! 	  typename T1,
! 	  typename Block0,
! 	  typename Block1>
! vsip::Vector<typename vsip::Promotion<T0, T1>::type>
! prod(
!   vsip::const_Vector<T1, Block1> a,
!   vsip::const_Matrix<T0, Block0> b)
! {
!   typedef typename vsip::Promotion<T0, T1>::type return_type;
! 
!   assert(a.size(0) == b.size(0));
! 
!   vsip::Vector<return_type> r(b.size(1), return_type());
! 
!   for (vsip::index_type k=0; k<b.size(1); ++k)
!     r.put( k, ref::dot(a, b.col(k)) );
  
    return r;
  }

Index: src/vsip/impl/eval-blas.hpp
===================================================================
RCS file: /home/cvs/Repository/vpp/src/vsip/impl/eval-blas.hpp,v
retrieving revision 1.1
diff -c -p -r1.1 eval-blas.hpp
*** src/vsip/impl/eval-blas.hpp	12 Oct 2005 12:45:05 -0000	1.1
--- src/vsip/impl/eval-blas.hpp	4 Nov 2005 19:32:27 -0000
*************** namespace vsip
*** 31,45 ****
  namespace impl
  {
  
! // BLAS evaluator for vector-vector dot-product (non-conjugated).
  
! template <typename T,
  	  typename Block1,
  	  typename Block2>
! struct Evaluator<Op_prod_vv, Return_scalar<T>, Op_list_2<Block1, Block2>,
  		 Blas_tag>
  {
    static bool const ct_valid = 
      impl::blas::Blas_traits<T>::valid &&
      Type_equal<T, typename Block1::value_type>::value &&
      Type_equal<T, typename Block2::value_type>::value &&
--- 31,191 ----
  namespace impl
  {
  
! // BLAS evaluator for vector-vector outer product
  
! template <typename T1,
! 	  typename Block0,
  	  typename Block1,
  	  typename Block2>
! struct Evaluator<Op_prod_vv, Block0, Op_list_3<T1, Block1, Block2>,
  		 Blas_tag>
  {
    static bool const ct_valid = 
+     impl::blas::Blas_traits<T1>::valid &&
+     Type_equal<T1, typename Block1::value_type>::value &&
+     Type_equal<T1, typename Block2::value_type>::value &&
+     // check that direct access is supported
+     Ext_data_cost<Block1>::value == 0 &&
+     Ext_data_cost<Block2>::value == 0;
+ 
+   static bool rt_valid(Block0& r, T1 alpha, Block1 const& a, Block2 const& b)
+   {
+     typedef typename Block_layout<Block1>::order_type order1_type;
+ 
+     Ext_data<Block1> ext_a(const_cast<Block1&>(a));
+     Ext_data<Block2> ext_b(const_cast<Block2&>(b));
+ 
+     return (ext_a.stride(0) == 1) && (ext_b.stride(0) == 1);
+   }
+ 
+   static void exec(Block0& r, T1 alpha, Block1 const& a, Block2 const& b)
+   {
+     assert(a.size(1, 0) == r.size(2, 0));
+     assert(b.size(1, 0) == r.size(2, 1));
+ 
+     typedef typename Block_layout<Block0>::order_type order0_type;
+ 
+     Ext_data<Block0> ext_r(const_cast<Block0&>(r));
+     Ext_data<Block1> ext_a(const_cast<Block1&>(a));
+     Ext_data<Block2> ext_b(const_cast<Block2&>(b));
+ 
+     if (Type_equal<order0_type, row2_type>::value)
+     {
+       // Use identity:
+       //   R = A B     <=>     trans(R) = trans(B) trans(A)
+ 
+       blas::ger( 
+         b.size(1, 0), a.size(1, 0),     // int m, int n,
+         alpha,                          // T alpha,
+         ext_b.data(), ext_b.stride(0),  // T *x, int incx,
+         ext_a.data(), ext_a.stride(0),  // T *y, int incy,
+         ext_r.data(), r.size(2, 1)      // T *a, int lda
+       );
+     }
+     else if (Type_equal<order0_type, col2_type>::value)
+     {
+       blas::ger( 
+         a.size(1, 0), b.size(1, 0),     // int m, int n,
+         alpha,                          // T alpha,
+         ext_a.data(), ext_a.stride(0),  // T *x, int incx,
+         ext_b.data(), ext_b.stride(0),  // T *y, int incy,
+         ext_r.data(), r.size(2, 0)      // T *a, int lda
+       );
+     }
+     else
+       assert(0);
+   }
+ };
+ 
+ template <typename T1,
+ 	  typename Block0,
+ 	  typename Block1,
+ 	  typename Block2>
+ struct Evaluator<Op_prod_vv, Block0, 
+                  Op_list_3<std::complex<T1>, Block1, Block2>,
+ 		 Blas_tag>
+ {
+   static bool const ct_valid = 
+     impl::blas::Blas_traits<std::complex<T1> >::valid &&
+     Type_equal<std::complex<T1>, typename Block1::value_type>::value &&
+     Type_equal<std::complex<T1>, typename Block2::value_type>::value &&
+     // check that direct access is supported
+     Ext_data_cost<Block1>::value == 0 &&
+     Ext_data_cost<Block2>::value == 0;
+ 
+   static bool rt_valid(Block0& r, std::complex<T1> alpha, 
+     Block1 const& a, Block2 const& b)
+   {
+     typedef typename Block_layout<Block1>::order_type order1_type;
+ 
+     Ext_data<Block1> ext_a(const_cast<Block1&>(a));
+     Ext_data<Block2> ext_b(const_cast<Block2&>(b));
+ 
+     return (ext_a.stride(0) == 1) && (ext_b.stride(0) == 1);
+   }
+ 
+   static void exec(Block0& r, std::complex<T1> alpha, 
+     Block1 const& a, Block2 const& b)
+   {
+     assert(a.size(1, 0) == r.size(2, 0));
+     assert(b.size(1, 0) == r.size(2, 1));
+ 
+     typedef typename Block_layout<Block0>::order_type order0_type;
+     typedef typename Block_layout<Block1>::order_type order1_type;
+     typedef typename Block_layout<Block2>::order_type order2_type;
+ 
+     Ext_data<Block0> ext_r(const_cast<Block0&>(r));
+     Ext_data<Block1> ext_a(const_cast<Block1&>(a));
+     Ext_data<Block2> ext_b(const_cast<Block2&>(b));
+ 
+     if (Type_equal<order0_type, row2_type>::value)
+     {
+       // BLAS does not have a function that will conjugate the first 
+       // vector and allow us to take advantage of the identity:
+       //   R = A B*     <=>     trans(R) = trans(B*) trans(A)
+       // This must be done prior to calling the library function.
+ 
+       Block2 conjb( b.size(1, 0) );
+       Ext_data<Block2> ext_conjb(const_cast<Block2&>(conjb));
+ 
+       std::complex<T1> *p_src = ext_b.data();
+       std::complex<T1> *p_dst = ext_conjb.data();
+       for ( index_type i = 0; i < b.size(1, 0); ++i )
+         *p_dst++ = conj(*p_src++);
+ 
+       blas::geru( 
+         b.size(1, 0), a.size(1, 0),     // int m, int n,
+         alpha,                          // T alpha,
+         ext_conjb.data(), ext_conjb.stride(0),  // T *x, int incx,
+         ext_a.data(), ext_a.stride(0),  // T *y, int incy,
+         ext_r.data(), r.size(2, 1)      // T *a, int lda
+       );
+     }
+     else if (Type_equal<order0_type, col2_type>::value)
+     {
+       blas::gerc( 
+         a.size(1, 0), b.size(1, 0),     // int m, int n,
+         alpha,                          // T alpha,
+         ext_a.data(), ext_a.stride(0),  // T *x, int incx,
+         ext_b.data(), ext_b.stride(0),  // T *y, int incy,
+         ext_r.data(), r.size(2, 0)      // T *a, int lda
+       );
+     }
+     else
+       assert(0);
+   }
+ };
+ 
+ 
+ // BLAS evaluator for vector-vector dot-product (non-conjugated).
+ 
+ template <typename T,
+           typename Block1,
+           typename Block2>
+ struct Evaluator<Op_prod_vv, Return_scalar<T>, Op_list_2<Block1, Block2>,
+                  Blas_tag>
+ {
+   static bool const ct_valid = 
      impl::blas::Blas_traits<T>::valid &&
      Type_equal<T, typename Block1::value_type>::value &&
      Type_equal<T, typename Block2::value_type>::value &&
*************** struct Evaluator<Op_prod_vv, Return_scal
*** 57,64 ****
      Ext_data<Block2> ext_b(const_cast<Block2&>(b));
  
      T r = blas::dot(a.size(1, 0),
! 		    ext_a.data(), ext_a.stride(0),
! 		    ext_b.data(), ext_b.stride(0));
  
      return r;
    }
--- 203,210 ----
      Ext_data<Block2> ext_b(const_cast<Block2&>(b));
  
      T r = blas::dot(a.size(1, 0),
!                     ext_a.data(), ext_a.stride(0),
!                     ext_b.data(), ext_b.stride(0));
  
      return r;
    }
*************** struct Evaluator<Op_prod_vv, Return_scal
*** 69,81 ****
  // BLAS evaluator for vector-vector dot-product (conjugated).
  
  template <typename T,
! 	  typename Block1,
! 	  typename Block2>
  struct Evaluator<Op_prod_vv, Return_scalar<complex<T> >,
! 		 Op_list_2<Block1, 
! 			   Unary_expr_block<1, impl::conj_functor,
! 					    Block2, complex<T> > const>,
! 		 Blas_tag>
  {
    static bool const ct_valid = 
      impl::blas::Blas_traits<complex<T> >::valid &&
--- 215,227 ----
  // BLAS evaluator for vector-vector dot-product (conjugated).
  
  template <typename T,
!           typename Block1,
!           typename Block2>
  struct Evaluator<Op_prod_vv, Return_scalar<complex<T> >,
!                  Op_list_2<Block1, 
!                            Unary_expr_block<1, impl::conj_functor,
!                                             Block2, complex<T> > const>,
!                  Blas_tag>
  {
    static bool const ct_valid = 
      impl::blas::Blas_traits<complex<T> >::valid &&
*************** struct Evaluator<Op_prod_vv, Return_scal
*** 100,107 ****
      Ext_data<Block2> ext_b(const_cast<Block2&>(b.op()));
  
      return blas::dotc(a.size(1, 0),
! 		      ext_b.data(), ext_b.stride(0),
! 		      ext_a.data(), ext_a.stride(0));
      // Note:
      //   BLAS    cdotc(x, y)  => conj(x) * y, while 
      //   VSIPL++ cvjdot(x, y) => x * conj(y)
--- 246,253 ----
      Ext_data<Block2> ext_b(const_cast<Block2&>(b.op()));
  
      return blas::dotc(a.size(1, 0),
!                       ext_b.data(), ext_b.stride(0),
!                       ext_a.data(), ext_a.stride(0));
      // Note:
      //   BLAS    cdotc(x, y)  => conj(x) * y, while 
      //   VSIPL++ cvjdot(x, y) => x * conj(y)
*************** struct Evaluator<Op_prod_vv, Return_scal
*** 110,122 ****
  
  
  
  // BLAS evaluator for matrix-matrix products.
  
  template <typename Block0,
! 	  typename Block1,
! 	  typename Block2>
  struct Evaluator<Op_prod_mm, Block0, Op_list_2<Block1, Block2>,
! 		 Blas_tag>
  {
    typedef typename Block0::value_type T;
  
--- 256,439 ----
  
  
  
+ // BLAS evaluator for matrix-vector product
+ template <typename Block0,
+           typename Block1,
+           typename Block2>
+ struct Evaluator<Op_prod_mv, Block0, Op_list_2<Block1, Block2>,
+                  Blas_tag>
+ {
+   typedef typename Block0::value_type T;
+ 
+   static bool const ct_valid = 
+     impl::blas::Blas_traits<T>::valid &&
+     Type_equal<T, typename Block1::value_type>::value &&
+     Type_equal<T, typename Block2::value_type>::value &&
+     // check that direct access is supported
+     Ext_data_cost<Block1>::value == 0 &&
+     Ext_data_cost<Block2>::value == 0;
+ 
+   static bool rt_valid(Block0& r, Block1 const& a, Block2 const& b)
+   {
+     typedef typename Block_layout<Block1>::order_type order1_type;
+ 
+     Ext_data<Block1> ext_a(const_cast<Block1&>(a));
+     Ext_data<Block2> ext_b(const_cast<Block2&>(b));
+ 
+     // Note: gemm is used for row type and b is restricted to unit stride.
+     // gemv is used for col type and can handle any stride for b.
+     bool is_a_row = Type_equal<order1_type, row2_type>::value;
+     return is_a_row ? ((ext_a.stride(1) == 1) && (ext_b.stride(0) == 1))
+                     : (ext_a.stride(0) == 1);
+   }
+ 
+   static void exec(Block0& r, Block1 const& a, Block2 const& b)
+   {
+     assert(a.size(2, 1) == b.size(1, 0));
+ 
+     typedef typename Block0::value_type RT;
+ 
+     typedef typename Block_layout<Block0>::order_type order0_type;
+     typedef typename Block_layout<Block1>::order_type order1_type;
+     typedef typename Block_layout<Block2>::order_type order2_type;
+ 
+     Ext_data<Block0> ext_r(const_cast<Block0&>(r));
+     Ext_data<Block1> ext_a(const_cast<Block1&>(a));
+     Ext_data<Block2> ext_b(const_cast<Block2&>(b));
+ 
+     if (Type_equal<order1_type, row2_type>::value)
+     {
+       // Use identity:
+       //   R = A B      <=>     trans(R) = trans(B) trans(A)
+       // to evaluate row-major matrix result with BLAS.
+ 
+       char transa   = 'n';           // already transposed
+       char transb   = 'n';
+ 
+       blas::gemm(
+         transa, transb,
+         1,                  // M
+         a.size(2, 0),       // N
+         a.size(2, 1),       // K
+         1.0,                // alpha
+         ext_b.data(), 1,    // vector, first dim is implicitly 1
+         ext_a.data(), a.size(2, 1),
+         0.0,                // beta
+         ext_r.data(), 1     // vector, first dim is implicitly 1
+       );
+     }
+     else if (Type_equal<order1_type, col2_type>::value)
+     {
+       char transa   = 'n';
+ 
+       blas::gemv( 
+         transa,                          // char trans,
+         a.size(2, 0), a.size(2, 1),      // int m, int n,
+         1.0,                             // T alpha,
+         ext_a.data(), a.size(2, 0),      // T *a, int lda,
+         ext_b.data(), ext_b.stride(0),   // T *x, int incx,
+         0.0,                             // T beta,
+         ext_r.data(), ext_r.stride(0)    // T *y, int incy)
+       );
+     }
+     else assert(0);
+   }
+ };
+ 
+ 
+ // BLAS evaluator for vector-matrix product
+ template <typename Block0,
+           typename Block1,
+           typename Block2>
+ struct Evaluator<Op_prod_vm, Block0, Op_list_2<Block1, Block2>,
+                  Blas_tag>
+ {
+   typedef typename Block0::value_type T;
+ 
+   static bool const ct_valid = 
+     impl::blas::Blas_traits<T>::valid &&
+     Type_equal<T, typename Block1::value_type>::value &&
+     Type_equal<T, typename Block2::value_type>::value &&
+     // check that direct access is supported
+     Ext_data_cost<Block1>::value == 0 &&
+     Ext_data_cost<Block2>::value == 0;
+ 
+   static bool rt_valid(Block0& r, Block1 const& a, Block2 const& b)
+   {
+     typedef typename Block_layout<Block2>::order_type order2_type;
+ 
+     Ext_data<Block1> ext_a(const_cast<Block1&>(a));
+     Ext_data<Block2> ext_b(const_cast<Block2&>(b));
+ 
+     // Note: gemv is used for row type and can handle any stride for a.
+     // gemm is used for col type and a is restricted to unit stride.
+     // 
+     bool is_b_row = Type_equal<order2_type, row2_type>::value;
+     return is_b_row ? (ext_b.stride(1) == 1) 
+                     : ((ext_b.stride(0) == 1) && (ext_a.stride(0) == 1));
+   }
+ 
+   static void exec(Block0& r, Block1 const& a, Block2 const& b)
+   {
+     assert(a.size(1, 0) == b.size(2, 0));
+ 
+     typedef typename Block0::value_type RT;
+ 
+     typedef typename Block_layout<Block0>::order_type order0_type;
+     typedef typename Block_layout<Block1>::order_type order1_type;
+     typedef typename Block_layout<Block2>::order_type order2_type;
+ 
+     Ext_data<Block0> ext_r(const_cast<Block0&>(r));
+     Ext_data<Block1> ext_a(const_cast<Block1&>(a));
+     Ext_data<Block2> ext_b(const_cast<Block2&>(b));
+ 
+     if (Type_equal<order2_type, row2_type>::value)
+     {
+       // Use identity:
+       //   R = A B      <=>     trans(R) = trans(B) trans(A)
+       // to evaluate row-major matrix result with BLAS.
+ 
+       char transa   = 'n';
+ 
+       blas::gemv( 
+         transa,                          // char trans,
+         b.size(2, 1), b.size(2, 0),      // int m, int n,
+         1.0,                             // T alpha,
+         ext_b.data(), b.size(2, 1),      // T *a, int lda,
+         ext_a.data(), ext_a.stride(0),   // T *x, int incx,
+         0.0,                             // T beta,
+         ext_r.data(), ext_r.stride(0)    // T *y, int incy)
+       );
+     }
+     else if (Type_equal<order2_type, col2_type>::value)
+     {
+       char transa   = 'n';
+       char transb   = 'n';
+ 
+       blas::gemm(
+         transa, transb,
+         1,                  // M
+         b.size(2, 1),       // N
+         b.size(2, 0),       // K
+         1.0,                // alpha
+         ext_a.data(), 1,    // vector, first dim is implicitly 1
+         ext_b.data(), b.size(2, 0),
+         0.0,                // beta
+         ext_r.data(), 1     // vector, first dim is implicitly 1
+       );
+     }
+     else assert(0);
+   }
+ };
+ 
+ 
  // BLAS evaluator for matrix-matrix products.
  
  template <typename Block0,
!           typename Block1,
!           typename Block2>
  struct Evaluator<Op_prod_mm, Block0, Op_list_2<Block1, Block2>,
!                  Blas_tag>
  {
    typedef typename Block0::value_type T;
  
*************** struct Evaluator<Op_prod_mm, Block0, Op_
*** 131,136 ****
--- 448,470 ----
  
    static bool rt_valid(Block0& r, Block1 const& a, Block2 const& b)
    {
+     typedef typename Block_layout<Block1>::order_type order1_type;
+     typedef typename Block_layout<Block2>::order_type order2_type;
+ 
+     Ext_data<Block1> ext_a(const_cast<Block1&>(a));
+     Ext_data<Block2> ext_b(const_cast<Block2&>(b));
+ 
+     bool is_a_row = Type_equal<order1_type, row2_type>::value;
+     bool is_b_row = Type_equal<order2_type, row2_type>::value;
+ 
+     return ( ext_a.stride(is_a_row ? 1 : 0) == 1 &&
+              ext_b.stride(is_b_row ? 1 : 0) == 1 );
+   }
+ 
+   static void exec(Block0& r, Block1 const& a, Block2 const& b)
+   {
+     typedef typename Block0::value_type RT;
+ 
      typedef typename Block_layout<Block0>::order_type order0_type;
      typedef typename Block_layout<Block1>::order_type order1_type;
      typedef typename Block_layout<Block2>::order_type order2_type;
*************** struct Evaluator<Op_prod_mm, Block0, Op_
*** 139,155 ****
      Ext_data<Block1> ext_a(const_cast<Block1&>(a));
      Ext_data<Block2> ext_b(const_cast<Block2&>(b));
  
!     bool is_r_row = Type_equal<order0_type, row2_type>::value;
      bool is_a_row = Type_equal<order1_type, row2_type>::value;
      bool is_b_row = Type_equal<order2_type, row2_type>::value;
  
!     return is_r_row ? (ext_a.stride(is_a_row ? 1 : 0) == 1 &&
! 		       ext_b.stride(is_b_row ? 1 : 0) == 1)
!                     : (ext_a.stride(is_a_row ? 0 : 1) == 1 &&
! 		       ext_b.stride(is_b_row ? 0 : 1) == 1);
    }
  
!   static void exec(Block0& r, Block1 const& a, Block2 const& b)
    {
      typedef typename Block0::value_type RT;
  
--- 473,565 ----
      Ext_data<Block1> ext_a(const_cast<Block1&>(a));
      Ext_data<Block2> ext_b(const_cast<Block2&>(b));
  
!     if (Type_equal<order0_type, row2_type>::value)
!     {
!       bool is_a_row = Type_equal<order1_type, row2_type>::value;
!       char transa   = is_a_row ? 'n' : 't';
!       int  lda      = is_a_row ? ext_a.stride(0) : ext_a.stride(1);
! 
!       bool is_b_row = Type_equal<order2_type, row2_type>::value;
!       char transb   = is_b_row ? 'n' : 't';
!       int  ldb      = is_b_row ? ext_b.stride(0) : ext_b.stride(1);
! 
!       // Use identity:
!       //   R = A B      <=>     trans(R) = trans(B) trans(A)
!       // to evaluate row-major matrix result with BLAS.
! 
!       blas::gemm(transb, transa,
!                  b.size(2, 1),  // N
!                  a.size(2, 0),  // M
!                  a.size(2, 1),  // K
!                  1.0,           // alpha
!                  ext_b.data(), ldb,
!                  ext_a.data(), lda,
!                  0.0,           // beta
!                  ext_r.data(), ext_r.stride(0));
!     }
!     else if (Type_equal<order0_type, col2_type>::value)
!     {
!       bool is_a_col = Type_equal<order1_type, col2_type>::value;
!       char transa   = is_a_col ? 'n' : 't';
!       int  lda      = is_a_col ? ext_a.stride(1) : ext_a.stride(0);
! 
!       bool is_b_col = Type_equal<order2_type, col2_type>::value;
!       char transb   = is_b_col ? 'n' : 't';
!       int  ldb      = is_b_col ? ext_b.stride(1) : ext_b.stride(0);
! 
!       blas::gemm(transa, transb,
!                  a.size(2, 0),  // M
!                  b.size(2, 1),  // N
!                  a.size(2, 1),  // K
!                  1.0,           // alpha
!                  ext_a.data(), lda,
!                  ext_b.data(), ldb,
!                  0.0,           // beta
!                  ext_r.data(), ext_r.stride(1));
!     }
!     else assert(0);
!   }
! };
! 
! 
! // BLAS evaluator for generalized matrix-matrix products.
! 
! template <typename T1,
! 	  typename T2,
! 	  typename Block0,
!           typename Block1,
!           typename Block2>
! struct Evaluator<Op_prod_mm, Block0, Op_list_4<T1, Block1, Block2, T2>,
!                  Blas_tag>
! {
!   typedef typename Block0::value_type T;
! 
!   static bool const ct_valid = 
!     impl::blas::Blas_traits<T>::valid &&
!     Type_equal<T, typename Block1::value_type>::value &&
!     Type_equal<T, typename Block2::value_type>::value &&
!     // check that direct access is supported
!     Ext_data_cost<Block0>::value == 0 &&
!     Ext_data_cost<Block1>::value == 0 &&
!     Ext_data_cost<Block2>::value == 0;
! 
!   static bool rt_valid(Block0& r, T1, Block1 const& a, Block2 const& b, T2)
!   {
!     typedef typename Block_layout<Block1>::order_type order1_type;
!     typedef typename Block_layout<Block2>::order_type order2_type;
! 
!     Ext_data<Block1> ext_a(const_cast<Block1&>(a));
!     Ext_data<Block2> ext_b(const_cast<Block2&>(b));
! 
      bool is_a_row = Type_equal<order1_type, row2_type>::value;
      bool is_b_row = Type_equal<order2_type, row2_type>::value;
  
!     return ( ext_a.stride(is_a_row ? 1 : 0) == 1 &&
!              ext_b.stride(is_b_row ? 1 : 0) == 1 );
    }
  
!   static void exec(Block0& r, T1 alpha, Block1 const& a, 
!     Block2 const& b, T2 beta)
    {
      typedef typename Block0::value_type RT;
  
*************** struct Evaluator<Op_prod_mm, Block0, Op_
*** 172,189 ****
        int  ldb      = is_b_row ? ext_b.stride(0) : ext_b.stride(1);
  
        // Use identity:
!       //   R = A B	<=>	trans(R) = trans(B) trans(A)
        // to evaluate row-major matrix result with BLAS.
  
        blas::gemm(transb, transa,
! 		 b.size(2, 1),	// N
! 		 a.size(2, 0),	// M
! 		 a.size(2, 1),	// K
! 		 1.0,		// alpha
! 		 ext_b.data(), ldb,
! 		 ext_a.data(), lda,
! 		 0.0,		// beta
! 		 ext_r.data(), ext_r.stride(0));
      }
      else if (Type_equal<order0_type, col2_type>::value)
      {
--- 582,599 ----
        int  ldb      = is_b_row ? ext_b.stride(0) : ext_b.stride(1);
  
        // Use identity:
!       //   R = A B      <=>     trans(R) = trans(B) trans(A)
        // to evaluate row-major matrix result with BLAS.
  
        blas::gemm(transb, transa,
!                  b.size(2, 1),  // N
!                  a.size(2, 0),  // M
!                  a.size(2, 1),  // K
!                  alpha,         // alpha
!                  ext_b.data(), ldb,
!                  ext_a.data(), lda,
!                  beta,          // beta
!                  ext_r.data(), ext_r.stride(0));
      }
      else if (Type_equal<order0_type, col2_type>::value)
      {
*************** struct Evaluator<Op_prod_mm, Block0, Op_
*** 196,209 ****
        int  ldb      = is_b_col ? ext_b.stride(1) : ext_b.stride(0);
  
        blas::gemm(transa, transb,
! 		 a.size(2, 0),	// M
! 		 b.size(2, 1),	// N
! 		 a.size(2, 1),	// K
! 		 1.0,		// alpha
! 		 ext_a.data(), lda,
! 		 ext_b.data(), ldb,
! 		 0.0,		// beta
! 		 ext_r.data(), ext_r.stride(1));
      }
      else assert(0);
    }
--- 606,619 ----
        int  ldb      = is_b_col ? ext_b.stride(1) : ext_b.stride(0);
  
        blas::gemm(transa, transb,
!                  a.size(2, 0),  // M
!                  b.size(2, 1),  // N
!                  a.size(2, 1),  // K
!                  alpha,         // alpha
!                  ext_a.data(), lda,
!                  ext_b.data(), ldb,
!                  beta,          // beta
!                  ext_r.data(), ext_r.stride(1));
      }
      else assert(0);
    }
*************** struct Evaluator<Op_prod_mm, Block0, Op_
*** 211,216 ****
--- 621,627 ----
  
  
  
+ 
  } // namespace vsip::impl
  
  } // namespace vsip
Index: src/vsip/impl/general_dispatch.hpp
===================================================================
RCS file: /home/cvs/Repository/vpp/src/vsip/impl/general_dispatch.hpp,v
retrieving revision 1.1
diff -c -p -r1.1 general_dispatch.hpp
*** src/vsip/impl/general_dispatch.hpp	12 Oct 2005 12:45:05 -0000	1.1
--- src/vsip/impl/general_dispatch.hpp	4 Nov 2005 19:32:27 -0000
*************** namespace impl
*** 35,40 ****
--- 35,42 ----
  
  struct Op_prod_vv;	// vector-vector dot-product
  struct Op_prod_mm;	// matrix-matrix product
+ struct Op_prod_mv;	// matrix-vector product
+ struct Op_prod_vm;	// vector-matrix product
  
  
  
*************** struct Op_prod_mm;	// matrix-matrix prod
*** 46,51 ****
--- 48,54 ----
  struct Blas_tag;		// BLAS implementation (ATLAS, MKL, etc)
  struct Intel_ipp_tag;		// Intel IPP library.
  struct Generic_tag;		// Generic implementation.
+ struct Mercury_sal_tag;		// Mercury SAL library.
  
  
  
*************** template <typename T> struct Return_scal
*** 59,64 ****
--- 62,71 ----
  
  template <typename Block1>                  struct Op_list_1 {};
  template <typename Block1, typename Block2> struct Op_list_2 {};
+ template <typename T0, typename Block1, 
+           typename Block2>                  struct Op_list_3 {};
+ template <typename T0, typename Block1, 
+           typename Block2, typename T3>     struct Op_list_4 {};
  
  
  
*************** struct Evaluator
*** 78,84 ****
  template <typename OpTag>
  struct Dispatch_order
  {
!   typedef typename Make_type_list<Blas_tag, Generic_tag>::type type;
  };
  
  
--- 85,93 ----
  template <typename OpTag>
  struct Dispatch_order
  {
!   typedef typename Make_type_list<
!       Blas_tag, Mercury_sal_tag, Generic_tag 
!     >::type type;
  };
  
  
*************** struct General_dispatch<OpTag, DstBlock,
*** 224,229 ****
--- 233,349 ----
    }
  };
  
+ 
+ /***********************************************************************
+   General_dispatch - 2-op block return specialization, one parameter
+ ***********************************************************************/
+ 
+ /// In case the compile-time check passes, we decide at run-time whether
+ /// or not to use this backend.
+ template <typename OpTag,
+ 	  typename DstBlock,
+           typename T0,
+ 	  typename Block1,
+ 	  typename Block2,
+ 	  typename TagList,
+ 	  typename Tag,
+ 	  typename Rest,
+ 	  typename EvalExpr>
+ struct General_dispatch<OpTag, DstBlock, Op_list_3<T0, Block1, Block2>,
+                        TagList, Tag, Rest, EvalExpr, true>
+ {
+   static void exec(DstBlock& res, T0 param1, Block1 const& op1, Block2 const& op2)
+   {
+     if (EvalExpr::rt_valid(res, param1, op1, op2))
+       EvalExpr::exec(res, param1, op1, op2);
+     else
+       General_dispatch<OpTag, DstBlock, Op_list_3<T0, Block1, Block2>, Rest>
+ 		::exec(res, param1, op1, op2);
+   }
+ };
+ 
+ 
+ 
+ /// Terminator. Instead of passing on to the next element
+ /// it aborts the program. It is a program error to define
+ /// callback lists that can't handle a given expression.
+ template <typename OpTag,
+ 	  typename DstBlock,
+           typename T0,
+ 	  typename Block1,
+ 	  typename Block2,
+ 	  typename TagList,
+ 	  typename Tag,
+ 	  typename EvalExpr>
+ struct General_dispatch<OpTag, DstBlock, Op_list_3<T0, Block1, Block2>,
+ 			TagList, Tag, None_type, EvalExpr, true>
+ {
+   static void exec(DstBlock& res, T0 param1, Block1 const& op1, Block2 const& op2)
+   {
+     if (EvalExpr::rt_valid(res, param1, op1, op2))
+       EvalExpr::exec(res, param1, op1, op2);
+     else
+       assert(0);
+   }
+ };
+ 
+ 
+ 
+ /***********************************************************************
+   General_dispatch - 2-op block return specialization, two parameters
+ ***********************************************************************/
+ 
+ /// In case the compile-time check passes, we decide at run-time whether
+ /// or not to use this backend.
+ template <typename OpTag,
+ 	  typename DstBlock,
+           typename T0,
+ 	  typename Block1,
+ 	  typename Block2,
+           typename T3,
+ 	  typename TagList,
+ 	  typename Tag,
+ 	  typename Rest,
+ 	  typename EvalExpr>
+ struct General_dispatch<OpTag, DstBlock, Op_list_4<T0, Block1, Block2, T3>,
+                        TagList, Tag, Rest, EvalExpr, true>
+ {
+   static void exec(DstBlock& res, T0 param1, Block1 const& op1, Block2 const& op2, T3 param2)
+   {
+     if (EvalExpr::rt_valid(res, param1, op1, op2, param2))
+       EvalExpr::exec(res, param1, op1, op2, param2);
+     else
+       General_dispatch<OpTag, DstBlock, Op_list_4<T0, Block1, Block2, T3>, Rest>
+ 		::exec(res, param1, op1, op2, param2);
+   }
+ };
+ 
+ 
+ 
+ /// Terminator. Instead of passing on to the next element
+ /// it aborts the program. It is a program error to define
+ /// callback lists that can't handle a given expression.
+ template <typename OpTag,
+ 	  typename DstBlock,
+           typename T0,
+ 	  typename Block1,
+ 	  typename Block2,
+           typename T3,
+ 	  typename TagList,
+ 	  typename Tag,
+ 	  typename EvalExpr>
+ struct General_dispatch<OpTag, DstBlock, Op_list_4<T0, Block1, Block2, T3>,
+ 			TagList, Tag, None_type, EvalExpr, true>
+ {
+   static void exec(DstBlock& res, T0 param1, Block1 const& op1, Block2 const& op2, T0 param2)
+   {
+     if (EvalExpr::rt_valid(res, param1, op1, op2, param2))
+       EvalExpr::exec(res, param1, op1, op2, param2);
+     else
+       assert(0);
+   }
+ };
+ 
  } // namespace vsip::impl
  } // namespace vsip
  
Index: src/vsip/impl/lapack.hpp
===================================================================
RCS file: /home/cvs/Repository/vpp/src/vsip/impl/lapack.hpp,v
retrieving revision 1.9
diff -c -p -r1.9 lapack.hpp
*** src/vsip/impl/lapack.hpp	28 Oct 2005 16:37:46 -0000	1.9
--- src/vsip/impl/lapack.hpp	4 Nov 2005 19:32:27 -0000
*************** extern "C"
*** 71,81 ****
--- 71,96 ----
    void ctrsm_ (char*, char*, char*, char*, I, I, C, C, I, C, I);
    void ztrsm_ (char*, char*, char*, char*, I, I, Z, Z, I, Z, I);
  
+   // gemv
+   void sgemv_(char*, I, I, S, S, I, S, I, S, S, I);
+   void dgemv_(char*, I, I, D, D, I, D, I, D, D, I);
+   void cgemv_(char*, I, I, C, C, I, C, I, C, C, I);
+   void zgemv_(char*, I, I, Z, Z, I, Z, I, Z, Z, I);
+ 
    // gemm
    void sgemm_(char*, char*, I, I, I, S, S, I, S, I, S, S, I);
    void dgemm_(char*, char*, I, I, I, D, D, I, D, I, D, D, I);
    void cgemm_(char*, char*, I, I, I, C, C, I, C, I, C, C, I);
    void zgemm_(char*, char*, I, I, I, Z, Z, I, Z, I, Z, Z, I);
+ 
+   // ger
+   void sger_  ( I, I, S, S, I, S, I, S, I );
+   void dger_  ( I, I, D, D, I, D, I, D, I );
+   void cgerc_ ( I, I, C, C, I, C, I, C, I );
+   void zgerc_ ( I, I, Z, Z, I, Z, I, Z, I );
+   void cgeru_ ( I, I, C, C, I, C, I, C, I );
+   void zgeru_ ( I, I, Z, Z, I, Z, I, Z, I );
+ 
  };
  
  #define VSIP_IMPL_BLAS_DOT(T, VPPFCN, BLASFCN)				\
*************** VSIP_IMPL_BLAS_TRSM(std::complex<double>
*** 140,145 ****
--- 155,188 ----
  
  
  
+ #define VSIP_IMPL_BLAS_GEMV(T, FCN)					\
+ inline void								\
+ gemv(char transa,            						\
+      int m, int n,       						\
+      T alpha,								\
+      T *a, int lda,							\
+      T *x, int incx,							\
+      T beta,								\
+      T *y, int incy)							\
+ {									\
+   FCN(&transa,            						\
+       &m, &n,    							\
+       &alpha,								\
+       a, &lda,								\
+       x, &incx,								\
+       &beta,								\
+       y, &incy);							\
+ }
+ 
+ VSIP_IMPL_BLAS_GEMV(float,                sgemv_)
+ VSIP_IMPL_BLAS_GEMV(double,               dgemv_)
+ VSIP_IMPL_BLAS_GEMV(std::complex<float>,  cgemv_)
+ VSIP_IMPL_BLAS_GEMV(std::complex<double>, zgemv_)
+ 
+ #undef VSIP_IMPL_BLAS_GEMV
+ 
+ 
+ 
  #define VSIP_IMPL_BLAS_GEMM(T, FCN)					\
  inline void								\
  gemm(char transa, char transb,						\
*************** VSIP_IMPL_BLAS_GEMM(std::complex<double>
*** 168,173 ****
--- 211,242 ----
  
  
  
+ #define VSIP_IMPL_BLAS_GER(T, VPPFCN, FCN)     	\
+ inline void					\
+ VPPFCN( int m, int n,           		\
+      T alpha,					\
+      T *x, int incx,				\
+      T *y, int incy,				\
+      T *a, int lda)				\
+ {						\
+   FCN(&m, &n,    				\
+       &alpha,					\
+       x, &incx,					\
+       y, &incy,					\
+       a, &lda);					\
+ }
+ 
+ VSIP_IMPL_BLAS_GER(float,                ger, sger_)
+ VSIP_IMPL_BLAS_GER(double,               ger, dger_)
+ VSIP_IMPL_BLAS_GER(std::complex<float>,  gerc, cgerc_)
+ VSIP_IMPL_BLAS_GER(std::complex<double>, gerc, zgerc_)
+ VSIP_IMPL_BLAS_GER(std::complex<float>,  geru, cgeru_)
+ VSIP_IMPL_BLAS_GER(std::complex<double>, geru, zgeru_)
+ 
+ #undef VSIP_IMPL_BLAS_GER
+ 
+ 
+ 
  template <typename T>
  struct Blas_traits
  {
Index: src/vsip/impl/matvec-prod.hpp
===================================================================
RCS file: /home/cvs/Repository/vpp/src/vsip/impl/matvec-prod.hpp,v
retrieving revision 1.3
diff -c -p -r1.3 matvec-prod.hpp
*** src/vsip/impl/matvec-prod.hpp	12 Oct 2005 12:45:05 -0000	1.3
--- src/vsip/impl/matvec-prod.hpp	4 Nov 2005 19:32:27 -0000
*************** generic_prod(
*** 90,95 ****
--- 90,124 ----
  
  
  
+ // Generic evaluator for matrix-vector products.
+ 
+ template <typename Block0,
+ 	  typename Block1,
+ 	  typename Block2>
+ struct Evaluator<Op_prod_mv, Block0, Op_list_2<Block1, Block2>,
+ 		 Generic_tag>
+ {
+   static bool const ct_valid = true;
+   static bool rt_valid(Block0&, Block1 const&, Block2 const&)
+   { return true; }
+ 
+   static void exec(Block0& r, Block1 const& a, Block2 const& b)
+   {
+     typedef typename Block0::value_type RT;
+ 
+     for (index_type i=0; i<r.size(1, 0); ++i)
+     {
+       RT sum = RT();
+       for (index_type k=0; k<a.size(2, 1); ++k)
+       {
+         sum += a.get(i, k) * b.get(k);
+       }
+       r.put(i, sum);
+     }
+   }
+ };
+ 
+ 
  /// Matrix-vector product.
  
  template <typename T0,
*************** generic_prod(
*** 107,123 ****
    assert(r.size() == a.size(0));
    assert(a.size(1) == b.size());
  
!   for (index_type i=0; i<r.size(0); ++i)
    {
!     T2 sum = T2();
!     for (index_type k=0; k<a.size(1); ++k)
      {
!       sum += a.get(i, k) * b.get(k);
      }
-     r.put(i, sum);
    }
! }
! 
  
  
  /// Vector-matrix product.
--- 136,179 ----
    assert(r.size() == a.size(0));
    assert(a.size(1) == b.size());
  
!   impl::General_dispatch<
! 		impl::Op_prod_mv,
! 		Block2,
!                 impl::Op_list_2<Block0, Block1>,
!                 typename impl::Dispatch_order<impl::Op_prod_mv>::type >
! 	::exec(r.block(), a.block(), b.block());
! }
! 
! 
! 
! 
! // Generic evaluator for vector-matrix products.
! 
! template <typename Block0,
! 	  typename Block1,
! 	  typename Block2>
! struct Evaluator<Op_prod_vm, Block0, Op_list_2<Block1, Block2>,
! 		 Generic_tag>
! {
!   static bool const ct_valid = true;
!   static bool rt_valid(Block0&, Block1 const&, Block2 const&)
!   { return true; }
! 
!   static void exec(Block0& r, Block1 const& a, Block2 const& b)
    {
!     typedef typename Block0::value_type RT;
! 
!     for (index_type i=0; i<r.size(); ++i)
      {
!       RT sum = RT();
!       for (index_type k=0; k<b.size(2, 0); ++k)
!       {
!         sum += a.get(k) * b.get(k, i);
!       }
!       r.put(i, sum);
      }
    }
! };
  
  
  /// Vector-matrix product.
*************** generic_prod(
*** 137,151 ****
    assert(r.size() == b.size(1));
    assert(a.size() == b.size(0));
  
!   for (index_type i=0; i<r.size(0); ++i)
!   {
!     T2 sum = T2();
!     for (index_type k=0; k<b.size(0); ++k)
!     {
!       sum += a.get(k) * b.get(k, i);
!     }
!     r.put(i, sum);
!   }
  }
  
  } // namespace vsip::impl
--- 193,204 ----
    assert(r.size() == b.size(1));
    assert(a.size() == b.size(0));
  
!   impl::General_dispatch<
! 		impl::Op_prod_vm,
! 		Block2,
!                 impl::Op_list_2<Block0, Block1>,
!                 typename impl::Dispatch_order<impl::Op_prod_vm>::type >
! 	::exec(r.block(), a.block(), b.block());
  }
  
  } // namespace vsip::impl
Index: src/vsip/impl/matvec.hpp
===================================================================
RCS file: /home/cvs/Repository/vpp/src/vsip/impl/matvec.hpp,v
retrieving revision 1.5
diff -c -p -r1.5 matvec.hpp
*** src/vsip/impl/matvec.hpp	12 Oct 2005 12:45:05 -0000	1.5
--- src/vsip/impl/matvec.hpp	4 Nov 2005 19:32:27 -0000
***************
*** 23,29 ****
  #include <vsip/impl/eval-blas.hpp>
  
  
- 
  namespace vsip
  {
  
--- 23,28 ----
*************** namespace impl
*** 32,37 ****
--- 31,113 ----
  
  
  
+ // Generic evaluator for outer product
+ 
+ template <typename T1,
+ 	  typename Block0,
+ 	  typename Block1,
+ 	  typename Block2>
+ struct Evaluator<Op_prod_vv, Block0, Op_list_3<T1, Block1, Block2>,
+ 		 Generic_tag>
+ {
+   static bool const ct_valid = true;
+   static bool rt_valid(Block0&, T1 alpha, Block1 const&, Block2 const&) 
+     { return true; }
+ 
+   static void exec(Block0& r, T1 alpha, Block1 const& a, Block2 const& b)
+   {
+     assert(a.size(1, 0) == r.size(2, 0));
+     assert(b.size(1, 0) == r.size(2, 1));
+ 
+     // r(i, j) = alpha * a(i) * b(j)
+     for ( index_type i = a.size(); i-- > 0; )
+       for ( index_type j = b.size(); j-- > 0; )
+         r.put( i, j, alpha * a.get(i) * b.get(j) );
+   }
+ };
+ 
+ 
+ template <typename T1,
+ 	  typename Block0,
+ 	  typename Block1,
+ 	  typename Block2>
+ struct Evaluator<Op_prod_vv, Block0, Op_list_3<std::complex<T1>, Block1, Block2>,
+ 		 Generic_tag>
+ {
+   static bool const ct_valid = true;
+   static bool rt_valid(Block0&, std::complex<T1> alpha, Block1 const&, Block2 const&) 
+     { return true; }
+ 
+   static void exec(Block0& r, std::complex<T1> alpha, Block1 const& a, Block2 const& b)
+   {
+     assert(a.size(1, 0) == r.size(2, 0));
+     assert(b.size(1, 0) == r.size(2, 1));
+ 
+     // r(i, j) = alpha * a(i) * b(j)
+     for ( index_type i = a.size(); i-- > 0; )
+       for ( index_type j = b.size(); j-- > 0; )
+         r.put( i, j, alpha * a.get(i) * conj(b.get(j)) );
+   }
+ };
+ 
+ 
+ /// Outer product
+ 
+ template <typename T0,
+ 	  typename T1,
+ 	  typename T2,
+ 	  typename Block0,
+ 	  typename Block1,
+ 	  typename Block2>
+ void
+ outer(
+   T0 alpha, 
+   const_Vector<T0, Block0> a,
+   const_Vector<T1, Block1> b,
+   Matrix<T2, Block2>       r)
+ {
+   assert(r.size(0) == a.size());
+   assert(r.size(1) == b.size());
+ 
+   impl::General_dispatch<
+ 		impl::Op_prod_vv,
+ 		Block2,
+                 impl::Op_list_3<T0, Block0, Block1>,
+                 typename impl::Dispatch_order<impl::Op_prod_vv>::type >
+ 	::exec(r.block(), alpha, a.block(), b.block());
+ }
+ 
+ 
  // Generic evaluator for vector-vector dot-product.
  
  template <typename T,
*************** kron( T0 alpha, Matrix<T1, Block1> v, Ma
*** 105,127 ****
  }
  
  
  
! template <typename T0, typename T1, typename T2, typename T3, typename T4,
!   typename Block1, typename Block2, typename Block4>
! void 
! gemp( T0 alpha, const_Matrix<T1, Block1> A,
!   const_Matrix<T2, Block2> B, T3 beta, Matrix<T4, Block4> C) 
  {
!   assert( A.size(0) == C.size(0) );
!   assert( B.size(1) == C.size(1) );
!   assert( A.size(1) == B.size(0) );  
  
!   for ( index_type i = A.size(0); i-- > 0; )
!     for ( index_type j = B.size(1); j-- > 0; )
!       C.put(i, j, alpha * dot( A.row(i), B.col(j) ) + beta * C.get(i, j));
  }
  
  
  template <typename T0, typename T1, typename T3, typename T4,
    typename Block1, typename Block4>
  void 
--- 181,249 ----
  }
  
  
+ // Generic evaluator for general product
  
! template <typename T1,
! 	  typename T2,
! 	  typename Block0,
! 	  typename Block1,
! 	  typename Block2>
! struct Evaluator<Op_prod_mm, Block0, Op_list_4<T1, Block1, Block2, T2>,
! 		 Generic_tag>
  {
!   static bool const ct_valid = true;
!   static bool rt_valid(Block0&, T1, Block1 const&, Block2 const&, T2) 
!     { return true; }
! 
!   static void exec(Block0& c, T1 alpha, Block1 const& a, 
!     Block2 const& b, T2 beta)
!   {
!     assert( a.size(2, 0) == c.size(2, 0) );
!     assert( b.size(2, 1) == c.size(2, 1) );
!     assert( a.size(2, 1) == b.size(2, 0) );  
!     
!     // c(i,j) = alpha * a(i) * b(j) + beta * c(i,j)
!     for ( index_type i = a.size(2, 0); i-- > 0; )
!     {
!       for ( index_type j = b.size(2, 1); j-- > 0; )
!       {
!         T1 dot = T1();
!         for ( index_type k = 0; k < a.size(2, 1); ++k )
!           dot += a.get(i, k) * b.get(k, j);
! 
!         c.put(i, j, alpha * dot + beta * c.get(i, j));
!       }
!     }
!   }
! };
  
! /// General matrix product
! 
! template <typename T0,
! 	  typename T1,
! 	  typename T2,
! 	  typename T4,
! 	  typename Block1,
! 	  typename Block2,
! 	  typename Block4>
! void
! gemp(
!   T0 alpha, const_Matrix<T1, Block1> a,
!   const_Matrix<T2, Block2> b, T0 beta, Matrix<T4, Block4> c) 
! {
!   assert(c.size(0) == a.size(0));
!   assert(c.size(1) == b.size(1));
! 
!   impl::General_dispatch<
! 		impl::Op_prod_mm,
! 		Block4,
!                 impl::Op_list_4<T0, Block1, Block2, T0>,
!                 typename impl::Dispatch_order<impl::Op_prod_mm>::type >
! 	::exec(c.block(), alpha, a.block(), b.block(), beta);
  }
  
  
+ 
  template <typename T0, typename T1, typename T3, typename T4,
    typename Block1, typename Block4>
  void 
*************** const_Matrix<typename Promotion<T0, type
*** 461,497 ****
  outer( T0 alpha, const_Vector<T1, Block1> v, const_Vector<T2, Block2> w )
      VSIP_NOTHROW
  {
!   typedef Matrix<typename Promotion<T0, 
!     typename Promotion<T1, T2>::type>::type> return_type;
!   return_type r( v.size(), w.size(), alpha );
! 
!   for ( index_type i = v.size(); i-- > 0; )
!     for ( index_type j = w.size(); j-- > 0; )
!       r.put( i, j, alpha * v.get(i) * w.get(j) );
! 
!   return r;
! }
  
! /// outer product of two complex vectors
! template <typename T0,
!           typename T1,
!           typename T2,
!           typename Block1,
!           typename Block2>
! const_Matrix<typename Promotion<T0, 
!   typename Promotion<std::complex<T1>, std::complex<T2> >::type>::type>
! outer( T0 alpha, const_Vector<std::complex<T1>, Block1> v, 
!                  const_Vector<std::complex<T2>, Block2> w )
!     VSIP_NOTHROW
! {
!   typedef Matrix<typename Promotion<T0, 
!     typename Promotion<std::complex<T1>, std::complex<T2> >::type
!         >::type> return_type;
!   return_type r( v.size(), w.size(), alpha );
! 
!   for ( index_type i = v.size(); i-- > 0; )
!     for ( index_type j = w.size(); j-- > 0; )
!       r.put( i, j, alpha * v.get(i) * conj(w.get(j)) );
  
    return r;
  }
--- 583,592 ----
  outer( T0 alpha, const_Vector<T1, Block1> v, const_Vector<T2, Block2> w )
      VSIP_NOTHROW
  {
!   typedef typename Promotion<T1, T2>::type return_type;
!   Matrix<return_type> r(v.size(), w.size(), return_type());
  
!   impl::outer(alpha, v, w, r);
  
    return r;
  }