Actions
|
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [patch] Mercury bits, performance bits
This patch covers fixes and improvements for running on the mercury. It also includes some of the performance work done for the LM site visit. -- Jules -- Jules Bergmann CodeSourcery jules@xxxxxxxxxxxxxxxx (650) 331-3385 x705 Index: ChangeLog
===================================================================
RCS file: /home/cvs/Repository/vpp/ChangeLog,v
retrieving revision 1.399
diff -u -r1.399 ChangeLog
--- ChangeLog 27 Feb 2006 15:07:12 -0000 1.399
+++ ChangeLog 28 Feb 2006 22:17:48 -0000
@@ -1,3 +1,71 @@
+2006-02-14 Jules Bergmann <jules@xxxxxxxxxxxxxxxx>
+
+ * autogen.sh: Doctor generated configure file to ignore extra
+ files generated by GreenHills compiler when detecting object
+ and executable exentions.
+ * configure.ac: New '--with-simd=TYPE' option to enable use of
+ SIMD extensions. Search for ippm library.
+
+ * apps/sarsim/histcmp.c: Define min() macro.
+ * apps/sarsim/util_io.c: Fix signed/unsigned warnings.
+
+ * benchmarks/conv.cpp: Remove unnecessary code. Add mem_per_point
+ function.
+ * benchmarks/conv_ipp.cpp: Add mem_per_point() function.
+ * benchmarks/copy.cpp: Likewise.
+ * benchmarks/fft_ipp.cpp: Likewise.
+ * benchmarks/qrd.cpp: Likewise.
+ * benchmarks/vmul_c.cpp: Likewise.
+ * benchmarks/vmul_ipp.cpp: Likewise.
+ * benchmarks/fastconv.cpp: Add in-place phase fast convolution
+ variants. Add mem_per_point function. Make std::log() calls
+ with float argument.
+ * benchmarks/fft.cpp: Add variants with scale != 1.0 to stress
+ scaling performance. Add mem_per_point function.
+ * benchmarks/loop.hpp: Add "left-hand side" metrics (either
+ points or memory size in bytes). Option to show time. Add
+ sweep mode.
+ * benchmarks/main.cpp: Likewise.
+ * benchmarks/mcopy.cpp: Extend benchmark functionality.
+ * benchmarks/mcopy_ipp.cpp: New benchmark for IPP matrix copy
+ and transpose.
+ * benchmarks/mpi_alltoall.cpp: New benchmark for MPI alltoall
+ functionality.
+ * benchmarks/vmul.cpp: Add cases for vector subviews, real-complex
+ vector multiply, and real-complex scalar-view multiply.
+
+ * doc/csl-docbook/GNUmakefile.inc: Robustify installation of HTML.
+ * src/vsip/GNUmakefile.inc.in: Add simd sources.
+ * src/vsip/support.hpp: Don't define noinline attribute for Greenhills.
+ * src/vsip/vector.hpp: Send scalar op= through dispatch.
+ * src/vsip/impl/distributed-block.hpp: Fix GCC 4.1 Wall warnings.
+ * src/vsip/impl/eval-sal.hpp: Fix Op_list_3 usage, fix Wall warnings.
+ * src/vsip/impl/expr_serial_dispatch.hpp: Hooks for dispatch to SIMD.
+ * src/vsip/impl/expr_serial_evaluator.hpp: Hold view extents in const
+ variable.
+ * src/vsip/impl/lvalue-proxy.hpp: Remove unnecessary implicit
+ conversion operator when deriving from std::complex<>.
+ * src/vsip/impl/par-chain-assign.hpp: Add profile events.
+ * src/vsip/impl/par-foreach.hpp: Fix submatrix typenames.
+ * src/vsip/impl/sal.cpp: Add wrappers for real*complex cases.
+ * src/vsip/impl/sal.hpp: Extend coverage of vector multiply
+ to mixed types and scalar-view cases.
+ * src/vsip/impl/signal-conv-sal.hpp: Fixed Wall warnings.
+
+ * src/vsip/impl/simd/eval-generic.hpp: New file, dispatch
+ evaluators for generic SIMD routines.
+ * src/vsip/impl/simd/simd.hpp: New file, SIMD definitions.
+ * src/vsip/impl/simd/vmul.cpp: New file, SIMD vector-multiply impl.
+ * src/vsip/impl/simd/vmul.hpp: New file, SIMD vector-multiply decls.
+
+ * tests/test.hpp: Define VSIP_IMPL_TEST_LEVEL.
+ * tests/convolution.cpp: Reduce compilation time when
+ VSIP_IMPL_TEST_LEVEL is 0.
+ * tests/extdata-subviews.cpp: Likewise.
+ * tests/scalar-view.cpp: Likewise.
+ * tests/regr_transpose.cpp: New file, regression test for
+ greenhills tail-recursion optimization defect.
+
2006-02-27 Stefan Seefeld <stefan@xxxxxxxxxxxxxxxx>
* tests/fft.cpp: Add tests for complex split format.
Index: autogen.sh
===================================================================
RCS file: /home/cvs/Repository/vpp/autogen.sh,v
retrieving revision 1.3
diff -u -r1.3 autogen.sh
--- autogen.sh 5 Dec 2005 15:16:15 -0000 1.3
+++ autogen.sh 28 Feb 2006 22:17:49 -0000
@@ -6,6 +6,12 @@
# Generate 'configure' from 'configure.ac'
autoconf
+# Tell configure to ignore non-executable/object files generated by
+# greenhills compiler
+cat configure | sed -e "s,\*.xcoff,\*.xcoff | *.dla | *.dnm | *.dbo | *.map," > configure.tmp
+mv configure.tmp configure
+chmod +x configure
+
if test -f "vendor/atlas/autogen.sh"; then
cd vendor/atlas
./autogen.sh
Index: configure.ac
===================================================================
RCS file: /home/cvs/Repository/vpp/configure.ac,v
retrieving revision 1.82
diff -u -r1.82 configure.ac
--- configure.ac 17 Feb 2006 20:23:44 -0000 1.82
+++ configure.ac 28 Feb 2006 22:17:49 -0000
@@ -240,6 +240,11 @@
[set CPU speed in MHz. Only necessary for TSC and if /proc/cpuinfo does not exist or is wrong]),,
[enable_cpu_mhz=none])
+AC_ARG_WITH([simd],
+ AS_HELP_STRING([--with-simd=WHAT],
+ [set SIMD extensions]),,
+ [with_simd=none])
+
#
# Files to generate.
@@ -981,10 +986,12 @@
ippcore_search="ippcore ippcoreem64t"
ipps_search="ipps ippsem64t"
ippi_search="ippi ippiem64t"
+ ippm_search="ippm ippmem64t"
else
ippcore_search="ippcore$with_ipp_suffix"
ipps_search="ipps$with_ipp_suffix"
ippi_search="ippi$with_ipp_suffix"
+ ippm_search="ippm$with_ipp_suffix"
fi
# Find the library.
save_LDFLAGS="$LDFLAGS"
@@ -1032,6 +1039,12 @@
[have_ippi="no"
LD_FLAGS="$save_LDFLAGS"])
+ AC_SEARCH_LIBS(
+ [ippmCopy_ma_32f_SS], [$ippm_search],
+ [have_ippm="yes"],
+ [have_ippm="no"
+ LD_FLAGS="$save_LDFLAGS"])
+
if test "$enable_ipp_fft" != "no"; then
AC_SUBST(VSIP_IMPL_IPP_FFT, 1)
AC_DEFINE_UNQUOTED(VSIP_IMPL_IPP_FFT, 1,
@@ -1487,6 +1500,67 @@
fi
#
+# Configure SIMD extensions
+#
+if test "$with_simd" != "none"; then
+ keep_IFS=$IFS
+ IFS=","
+
+ taglist=""
+
+ for simd_type in $with_simd; do
+ AC_MSG_CHECKING([SIMD Tag $simd_type])
+ if test "$simd_type" == "3dnowext-32"; then
+ taglist="${taglist}Simd_3dnowext_tag,"
+ AC_SUBST(USE_SIMD_3DNOWEXT_32, 1)
+ AC_DEFINE_UNQUOTED(VSIP_IMPL_HAVE_SIMD_3DNOWEXT, 1,
+ [Define whether to use 3DNow!-ext ISA routines in expr dispatch.])
+ AC_MSG_RESULT([ok])
+
+ elif test "$simd_type" == "3dnowext-64"; then
+ taglist="${taglist}Simd_3dnowext_tag,"
+ AC_SUBST(USE_SIMD_3DNOWEXT_64, 1)
+ AC_DEFINE_UNQUOTED(VSIP_IMPL_HAVE_SIMD_3DNOWEXT, 1,
+ [Define whether to use 3DNow!-ext ISA routines in expr dispatch.])
+ AC_MSG_RESULT([ok])
+
+ elif test "$simd_type" == "sse2-32"; then
+ taglist="${taglist}Simd_sse2_tag,"
+ AC_SUBST(USE_SIMD_SSE2_32, 1)
+ AC_DEFINE_UNQUOTED(VSIP_IMPL_HAVE_SIMD_SSE2, 1,
+ [Define whether to use SSE2 ISA routines in expr dispatch.])
+ AC_MSG_RESULT([ok])
+
+ elif test "$simd_type" == "sse2-64"; then
+ taglist="${taglist}Simd_sse2_tag,"
+ AC_SUBST(USE_SIMD_SSE2_64, 1)
+ AC_DEFINE_UNQUOTED(VSIP_IMPL_HAVE_SIMD_SSE2, 1,
+ [Define whether to use SSE2 ISA routines in expr dispatch.])
+ AC_MSG_RESULT([ok])
+
+ elif test "$simd_type" == "generic"; then
+ taglist="${taglist}Simd_tag,"
+ AC_DEFINE_UNQUOTED(VSIP_IMPL_HAVE_SIMD_GENERIC, 1,
+ [Define whether to use Generic SIMD routines in expr dispatch.])
+ AC_MSG_RESULT([ok])
+
+ else
+ AC_MSG_ERROR([Invalid tag to --with-simd=SIMD: $simd_type.])
+ fi
+ done
+
+ IFS=$keep_IFS
+else
+ taglist=""
+fi
+
+AC_DEFINE_UNQUOTED(VSIP_IMPL_SIMD_TAG_LIST, $taglist,
+ [Define to set whether or not to use Intel's IPP library.])
+
+mkdir -p src/vsip/impl/simd
+
+
+#
# library
#
ARFLAGS="r"
Index: apps/sarsim/histcmp.c
===================================================================
RCS file: /home/cvs/Repository/vpp/apps/sarsim/histcmp.c,v
retrieving revision 1.3
diff -u -r1.3 histcmp.c
--- apps/sarsim/histcmp.c 10 Sep 2005 17:59:24 -0000 1.3
+++ apps/sarsim/histcmp.c 28 Feb 2006 22:17:50 -0000
@@ -28,6 +28,8 @@
#include "sarx.h"
#include "util_io.h"
+#define min(a,b) ((a)<(b)?(a):(b))
+
main(argc,argv)
int argc;
char **argv;
Index: apps/sarsim/util_io.c
===================================================================
RCS file: /home/cvs/Repository/vpp/apps/sarsim/util_io.c,v
retrieving revision 1.1
diff -u -r1.1 util_io.c
--- apps/sarsim/util_io.c 16 Jun 2005 18:01:20 -0000 1.1
+++ apps/sarsim/util_io.c 28 Feb 2006 22:17:51 -0000
@@ -27,7 +27,7 @@
size_t nmemb,
FILE *stream)
{
- int i;
+ size_t i;
assert(size*nmemb < BUF_SIZE);
@@ -79,9 +79,9 @@
FILE *stream,
int align)
{
- int i;
+ size_t i;
- int n_align = size*nmemb/align;
+ size_t n_align = size*nmemb/align;
assert(n_align*align == size*nmemb);
@@ -131,7 +131,7 @@
size_t nmemb,
FILE *stream)
{
- int i;
+ size_t i;
if (size == 1) {
return fread(ptr, size, nmemb, stream);
@@ -168,9 +168,9 @@
FILE *stream,
int align)
{
- int i;
+ size_t i;
- int n_align = size*nmemb/align;
+ size_t n_align = size*nmemb/align;
assert(n_align*align == size*nmemb);
if (align == 1) {
Index: benchmarks/conv.cpp
===================================================================
RCS file: /home/cvs/Repository/vpp/benchmarks/conv.cpp,v
retrieving revision 1.3
diff -u -r1.3 conv.cpp
--- benchmarks/conv.cpp 26 Sep 2005 20:11:05 -0000 1.3
+++ benchmarks/conv.cpp 28 Feb 2006 22:17:51 -0000
@@ -27,38 +27,10 @@
using namespace vsip;
-/// Return a random value between -0.5 and +0.5
-
-template <typename T>
-struct Random
-{
- static T value() { return T(1.f * rand()/(RAND_MAX+1.0)) - T(0.5); }
-};
-
-/// Specialization for random complex value.
-
-template <typename T>
-struct Random<complex<T> >
-{
- static complex<T> value() {
- return complex<T>(Random<T>::value(), Random<T>::value());
- }
-};
-
-
-
-/// Fill a matrix with random values.
-
-template <typename T,
- typename Block>
-void
-randm(Matrix<T, Block> m)
-{
- for (index_type r=0; r<m.size(0); ++r)
- for (index_type c=0; c<m.size(1); ++c)
- m(r, c) = Random<T>::value();
-}
+/***********************************************************************
+ Definitions
+***********************************************************************/
template <support_region_type Supp,
typename T>
@@ -86,6 +58,7 @@
int riob_per_point(length_type) { return -1*sizeof(T); }
int wiob_per_point(length_type) { return -1*sizeof(T); }
+ int mem_per_point(length_type) { return 2*sizeof(T); }
void operator()(length_type size, length_type loop, float& time)
{
Index: benchmarks/conv_ipp.cpp
===================================================================
RCS file: /home/cvs/Repository/vpp/benchmarks/conv_ipp.cpp,v
retrieving revision 1.2
diff -u -r1.2 conv_ipp.cpp
--- benchmarks/conv_ipp.cpp 7 Sep 2005 12:19:30 -0000 1.2
+++ benchmarks/conv_ipp.cpp 28 Feb 2006 22:17:51 -0000
@@ -73,6 +73,7 @@
}
int riob_per_point(size_t) { return 1*sizeof(Ipp32f); }
int wiob_per_point(size_t) { return 1*sizeof(Ipp32f); }
+ int mem_per_point(size_t) { return 2*sizeof(Ipp32f); }
void operator()(size_t size, size_t loop, float& time)
{
Index: benchmarks/copy.cpp
===================================================================
RCS file: /home/cvs/Repository/vpp/benchmarks/copy.cpp,v
retrieving revision 1.1
diff -u -r1.1 copy.cpp
--- benchmarks/copy.cpp 7 Sep 2005 12:19:30 -0000 1.1
+++ benchmarks/copy.cpp 28 Feb 2006 22:17:51 -0000
@@ -41,6 +41,7 @@
int ops_per_point(length_type) { return 1; }
int riob_per_point(length_type) { return 1*sizeof(T); }
int wiob_per_point(length_type) { return 1*sizeof(T); }
+ int mem_per_point(length_type) { return 1*sizeof(T); }
void operator()(length_type size, length_type loop, float& time)
{
@@ -106,6 +107,7 @@
int ops_per_point(length_type) { return 1; }
int riob_per_point(length_type) { return 1*sizeof(T); }
int wiob_per_point(length_type) { return 1*sizeof(T); }
+ int mem_per_point(length_type) { return 1*sizeof(T); }
void operator()(length_type size, length_type loop, float& time)
{
@@ -169,6 +171,7 @@
int ops_per_point(length_type) { return 1; }
int riob_per_point(length_type) { return 1*sizeof(T); }
int wiob_per_point(length_type) { return 1*sizeof(T); }
+ int mem_per_point(length_type) { return 1*sizeof(T); }
void operator()(length_type size, length_type loop, float& time)
{
Index: benchmarks/fastconv.cpp
===================================================================
RCS file: /home/cvs/Repository/vpp/benchmarks/fastconv.cpp,v
retrieving revision 1.2
diff -u -r1.2 fastconv.cpp
--- benchmarks/fastconv.cpp 19 Dec 2005 16:08:55 -0000 1.2
+++ benchmarks/fastconv.cpp 28 Feb 2006 22:17:51 -0000
@@ -34,7 +34,7 @@
int
fft_ops(length_type len)
{
- return int(5 * std::log(len) / std::log(2));
+ return int(5 * std::log((float)len) / std::log(2.f));
}
template <typename T,
@@ -45,7 +45,9 @@
struct Impl1ip; // in-place, phased fast-convolution
struct Impl1pip1; // psuedo in-place (using in-place Fft), phased
struct Impl1pip2; // psuedo in-place (using out-of-place Fft), phased
-struct Impl2; // out-of-place (tmp), interleaved fast-convolution
+struct Impl2op; // out-of-place (tmp), interleaved fast-convolution
+struct Impl2ip; // in-place, interleaved fast-convolution
+struct Impl2ip_tmp; // in-place (w/tmp), interleaved fast-convolution
struct Impl2fv; // foreach_vector, interleaved fast-convolution
struct Impl1pip2_nopar;
@@ -54,7 +56,7 @@
{
float ops(length_type npulse, length_type nrange)
{
- float fft_ops = 5 * nrange * std::log(nrange) / std::log(2);
+ float fft_ops = 5 * nrange * std::log((float)nrange) / std::log(2.f);
float tot_ops = 2 * npulse * fft_ops + 6 * npulse * nrange;
return tot_ops;
}
@@ -99,7 +101,7 @@
// Create the FFT objects.
for_fftm_type for_fftm(Domain<2>(npulse, nrange), 1.0);
- inv_fftm_type inv_fftm(Domain<2>(npulse, nrange), 1.0/(nrange*npulse));
+ inv_fftm_type inv_fftm(Domain<2>(npulse, nrange), 1.0/nrange);
// Initialize
data = T();
@@ -155,7 +157,7 @@
// Create the FFT objects.
for_fftm_type for_fftm(Domain<2>(npulse, nrange), 1.0);
- inv_fftm_type inv_fftm(Domain<2>(npulse, nrange), 1.0/(nrange*npulse));
+ inv_fftm_type inv_fftm(Domain<2>(npulse, nrange), 1.0/nrange);
// Initialize
data = T();
@@ -384,11 +386,11 @@
/***********************************************************************
- Impl2: out-of-place (tmp), interleaved fast-convolution
+ Impl2op: out-of-place (tmp), interleaved fast-convolution
***********************************************************************/
template <typename T>
-struct t_fastconv_base<T, Impl2> : fastconv_ops
+struct t_fastconv_base<T, Impl2op> : fastconv_ops
{
void fastconv(length_type npulse, length_type nrange,
length_type loop, float& time)
@@ -456,6 +458,152 @@
/***********************************************************************
+ Impl2ip: in-place, interleaved fast-convolution
+***********************************************************************/
+
+template <typename T>
+struct t_fastconv_base<T, Impl2ip> : fastconv_ops
+{
+ void fastconv(length_type npulse, length_type nrange,
+ length_type loop, float& time)
+ {
+ typedef Map<Block_dist, Whole_dist> map_type;
+ typedef Dense<2, T, row2_type, map_type> block_type;
+ typedef Matrix<T, block_type> view_type;
+
+ typedef Dense<1, T, row1_type, Global_map<1> > replica_block_type;
+ typedef Vector<T, replica_block_type> replica_view_type;
+
+ processor_type np = num_processors();
+ map_type map = map_type(Block_dist(np), Whole_dist());
+
+ // Create the data cube.
+ view_type data(npulse, nrange, map);
+ // Vector<T> tmp(nrange);
+
+ // Create the pulse replica
+ replica_view_type replica(nrange);
+
+ // int const no_times = 0; // FFTW_PATIENT
+ int const no_times = 15; // not > 12 = FFT_MEASURE
+
+ typedef Fft<const_Vector, T, T, fft_fwd, by_reference, no_times>
+ for_fft_type;
+ typedef Fft<const_Vector, T, T, fft_inv, by_reference, no_times>
+ inv_fft_type;
+
+ // Create the FFT objects.
+ for_fft_type for_fft(Domain<1>(nrange), 1.0);
+ inv_fft_type inv_fft(Domain<1>(nrange), 1.0/(nrange));
+
+ // Initialize
+ data = T();
+ replica = T();
+
+
+ // Before fast convolution, convert the replica into the
+ // frequency domain
+ // for_fft(replica);
+
+ vsip::impl::profile::Timer t1;
+
+ t1.start();
+ for (index_type l=0; l<loop; ++l)
+ {
+ typename view_type::local_type l_data = data.local();
+ typename replica_view_type::local_type l_replica = replica.local();
+ length_type l_npulse = l_data.size(0);
+ for (index_type p=0; p<l_npulse; ++p)
+ {
+ for_fft(l_data.row(p));
+ l_data.row(p) *= l_replica;
+ inv_fft(l_data.row(p));
+ }
+ }
+ t1.stop();
+
+ // CHECK RESULT
+ time = t1.delta();
+ }
+};
+
+
+
+/***********************************************************************
+ Impl2ip_tmp: in-place, interleaved fast-convolution
+***********************************************************************/
+
+template <typename T>
+struct t_fastconv_base<T, Impl2ip_tmp> : fastconv_ops
+{
+ void fastconv(length_type npulse, length_type nrange,
+ length_type loop, float& time)
+ {
+ typedef Map<Block_dist, Whole_dist> map_type;
+ typedef Dense<2, T, row2_type, map_type> block_type;
+ typedef Matrix<T, block_type> view_type;
+
+ typedef Dense<1, T, row1_type, Global_map<1> > replica_block_type;
+ typedef Vector<T, replica_block_type> replica_view_type;
+
+ processor_type np = num_processors();
+ map_type map = map_type(Block_dist(np), Whole_dist());
+
+ // Create the data cube.
+ view_type data(npulse, nrange, map);
+ Vector<T> tmp(nrange);
+
+ // Create the pulse replica
+ replica_view_type replica(nrange);
+
+ // int const no_times = 0; // FFTW_PATIENT
+ int const no_times = 15; // not > 12 = FFT_MEASURE
+
+ typedef Fft<const_Vector, T, T, fft_fwd, by_reference, no_times>
+ for_fft_type;
+ typedef Fft<const_Vector, T, T, fft_inv, by_reference, no_times>
+ inv_fft_type;
+
+ // Create the FFT objects.
+ for_fft_type for_fft(Domain<1>(nrange), 1.0);
+ inv_fft_type inv_fft(Domain<1>(nrange), 1.0/(nrange));
+
+ // Initialize
+ data = T();
+ replica = T();
+
+
+ // Before fast convolution, convert the replica into the
+ // frequency domain
+ // for_fft(replica);
+
+ vsip::impl::profile::Timer t1;
+
+ t1.start();
+ for (index_type l=0; l<loop; ++l)
+ {
+ typename view_type::local_type l_data = data.local();
+ typename replica_view_type::local_type l_replica = replica.local();
+ length_type l_npulse = l_data.size(0);
+ for (index_type p=0; p<l_npulse; ++p)
+ {
+ tmp = l_data.row(p);
+ for_fft(tmp);
+ tmp *= l_replica;
+ inv_fft(tmp);
+ l_data.row(p) = tmp;
+ }
+ }
+ t1.stop();
+
+ // CHECK RESULT
+ time = t1.delta();
+ }
+};
+
+
+
+/***********************************************************************
Impl2fv: foreach_vector, interleaved fast-convolution
***********************************************************************/
@@ -555,8 +703,9 @@
char* what() { return "t_fastconv_pf"; }
int ops_per_point(length_type size)
{ return (int)(this->ops(npulse_, size) / size); }
- int riob_per_point(length_type) { return -1*sizeof(T); }
- int wiob_per_point(length_type) { return -1*sizeof(T); }
+ int riob_per_point(length_type) { return -1*(int)sizeof(T); }
+ int wiob_per_point(length_type) { return -1*(int)sizeof(T); }
+ int mem_per_point(length_type size) { return npulse_*sizeof(T); }
void operator()(length_type size, length_type loop, float& time)
{
@@ -581,8 +730,9 @@
char* what() { return "t_fastconv_rf"; }
int ops_per_point(length_type size)
{ return (int)(this->ops(size, nrange_) / size); }
- int riob_per_point(length_type) { return -1*sizeof(T); }
- int wiob_per_point(length_type) { return -1*sizeof(T); }
+ int riob_per_point(length_type) { return -1*(int)sizeof(T); }
+ int wiob_per_point(length_type) { return -1*(int)sizeof(T); }
+ int mem_per_point(length_type size) { return nrange_*sizeof(T); }
void operator()(length_type size, length_type loop, float& time)
{
@@ -619,8 +769,10 @@
case 2: loop(t_fastconv_pf<complex<float>, Impl1ip>(param1)); break;
case 3: loop(t_fastconv_pf<complex<float>, Impl1pip1>(param1)); break;
case 4: loop(t_fastconv_pf<complex<float>, Impl1pip2>(param1)); break;
- case 5: loop(t_fastconv_pf<complex<float>, Impl2>(param1)); break;
- case 6: loop(t_fastconv_pf<complex<float>, Impl2fv>(param1)); break;
+ case 5: loop(t_fastconv_pf<complex<float>, Impl2op>(param1)); break;
+ case 6: loop(t_fastconv_pf<complex<float>, Impl2ip>(param1)); break;
+ case 7: loop(t_fastconv_pf<complex<float>, Impl2ip_tmp>(param1)); break;
+ case 8: loop(t_fastconv_pf<complex<float>, Impl2fv>(param1)); break;
case 9: loop(t_fastconv_pf<complex<float>, Impl1pip2_nopar>(param1)); break;
@@ -628,8 +780,10 @@
case 12: loop(t_fastconv_rf<complex<float>, Impl1ip>(param1)); break;
case 13: loop(t_fastconv_rf<complex<float>, Impl1pip1>(param1)); break;
case 14: loop(t_fastconv_rf<complex<float>, Impl1pip2>(param1)); break;
- case 15: loop(t_fastconv_rf<complex<float>, Impl2>(param1)); break;
- case 16: loop(t_fastconv_rf<complex<float>, Impl2fv>(param1)); break;
+ case 15: loop(t_fastconv_rf<complex<float>, Impl2op>(param1)); break;
+ case 16: loop(t_fastconv_rf<complex<float>, Impl2ip>(param1)); break;
+ case 17: loop(t_fastconv_rf<complex<float>, Impl2ip_tmp>(param1)); break;
+ case 18: loop(t_fastconv_rf<complex<float>, Impl2fv>(param1)); break;
default: return 0;
}
Index: benchmarks/fft.cpp
===================================================================
RCS file: /home/cvs/Repository/vpp/benchmarks/fft.cpp,v
retrieving revision 1.3
diff -u -r1.3 fft.cpp
--- benchmarks/fft.cpp 26 Sep 2005 20:11:05 -0000 1.3
+++ benchmarks/fft.cpp 28 Feb 2006 22:17:51 -0000
@@ -28,7 +28,7 @@
int
fft_ops(length_type len)
{
- return int(5 * std::log(len) / std::log(2));
+ return int(5 * std::log((float)len) / std::log(2.f));
}
@@ -37,8 +37,9 @@
{
char* what() { return "t_fft_op"; }
int ops_per_point(length_type len) { return fft_ops(len); }
- int riob_per_point(length_type) { return -1*sizeof(T); }
- int wiob_per_point(length_type) { return -1*sizeof(T); }
+ int riob_per_point(length_type) { return -1*(int)sizeof(T); }
+ int wiob_per_point(length_type) { return -1*(int)sizeof(T); }
+ int mem_per_point(length_type) { return 1*sizeof(T); }
void operator()(length_type size, length_type loop, float& time)
{
@@ -51,7 +52,7 @@
typedef Fft<const_Vector, T, T, fft_fwd, by_reference, no_times, alg_time>
fft_type;
- fft_type fft(Domain<1>(size), 1.f);
+ fft_type fft(Domain<1>(size), scale_ ? (1.f/size) : 1.f);
A = T(1);
@@ -62,7 +63,7 @@
fft(A, Z);
t1.stop();
- if (!equal(Z(0), T(size)))
+ if (!equal(Z(0), T(scale_ ? 1 : size)))
{
std::cout << "t_fft_op: ERROR" << std::endl;
abort();
@@ -70,6 +71,11 @@
time = t1.delta();
}
+
+ t_fft_op(bool scale) : scale_(scale) {}
+
+ // Member data
+ bool scale_;
};
@@ -79,8 +85,9 @@
{
char* what() { return "t_fft_ip"; }
int ops_per_point(length_type len) { return fft_ops(len); }
- int riob_per_point(length_type) { return -1*sizeof(T); }
- int wiob_per_point(length_type) { return -1*sizeof(T); }
+ int riob_per_point(length_type) { return -1*(int)sizeof(T); }
+ int wiob_per_point(length_type) { return -1*(int)sizeof(T); }
+ int mem_per_point(length_type) { return 1*sizeof(T); }
void operator()(length_type size, length_type loop, float& time)
{
@@ -92,7 +99,7 @@
typedef Fft<const_Vector, T, T, fft_fwd, by_reference, no_times, alg_time>
fft_type;
- fft_type fft(Domain<1>(size), 1.f);
+ fft_type fft(Domain<1>(size), scale_ ? (1.f/size) : 1.f);
vsip::impl::profile::Timer t1;
@@ -109,6 +116,11 @@
time = t1.delta();
}
+
+ t_fft_ip(bool scale) : scale_(scale) {}
+
+ // Member data
+ bool scale_;
};
@@ -126,8 +138,11 @@
{
switch (what)
{
- case 1: loop(t_fft_op<complex<float> >()); break;
- case 2: loop(t_fft_ip<complex<float> >()); break;
+ case 1: loop(t_fft_op<complex<float> >(false)); break;
+ case 2: loop(t_fft_ip<complex<float> >(false)); break;
+
+ case 5: loop(t_fft_op<complex<float> >(true)); break;
+ case 6: loop(t_fft_ip<complex<float> >(true)); break;
default: return 0;
}
return 1;
Index: benchmarks/fft_ipp.cpp
===================================================================
RCS file: /home/cvs/Repository/vpp/benchmarks/fft_ipp.cpp,v
retrieving revision 1.2
diff -u -r1.2 fft_ipp.cpp
--- benchmarks/fft_ipp.cpp 7 Sep 2005 12:19:30 -0000 1.2
+++ benchmarks/fft_ipp.cpp 28 Feb 2006 22:17:51 -0000
@@ -50,6 +50,7 @@
int ops_per_point(size_t len) { return fft_ops(len); }
int riob_per_point(size_t) { return -1*sizeof(Ipp32fc); }
int wiob_per_point(size_t) { return -1*sizeof(Ipp32fc); }
+ int mem_per_point(size_t) { return 2*sizeof(Ipp32fc); }
void operator()(size_t size, size_t loop, float& time)
{
Index: benchmarks/loop.hpp
===================================================================
RCS file: /home/cvs/Repository/vpp/benchmarks/loop.hpp,v
retrieving revision 1.10
diff -u -r1.10 loop.hpp
--- benchmarks/loop.hpp 14 Feb 2006 13:42:51 -0000 1.10
+++ benchmarks/loop.hpp 28 Feb 2006 22:17:51 -0000
@@ -34,10 +34,24 @@
pts_per_sec,
ops_per_sec,
iob_per_sec,
+ wiob_per_sec,
all_per_sec
};
+enum lhs_metric
+{
+ lhs_pts,
+ lhs_mem
+};
+
+enum bench_mode
+{
+ steady_mode,
+ sweep_mode
+};
+
+
// 1 Parameter Loop
class Loop1P
{
@@ -50,17 +64,26 @@
start_ (2),
stop_ (21),
cal_ (4),
- loop_start_ (20),
+ loop_start_ (10),
samples_ (1),
goal_sec_ (1.0),
metric_ (pts_per_sec),
+ lhs_ (lhs_pts),
note_ (0),
do_prof_ (false),
what_ (0),
- show_loop_ (false)
+ show_loop_ (false),
+ show_time_ (false),
+ mode_ (sweep_mode)
{}
template <typename Functor>
+ void sweep(Functor func);
+
+ template <typename Functor>
+ void steady(Functor func);
+
+ template <typename Functor>
void operator()(Functor func);
template <typename Functor>
@@ -76,11 +99,14 @@
unsigned samples_;
double goal_sec_; // measurement goal (in seconds)
output_metric metric_;
+ lhs_metric lhs_;
char* note_;
int user_param_;
bool do_prof_;
int what_;
bool show_loop_;
+ bool show_time_;
+ bench_mode mode_;
};
@@ -114,6 +140,11 @@
* loop;
return ops / (time * 1e6);
}
+ else if (m == wiob_per_sec)
+ {
+ double ops = (double)M * fcn.wiob_per_point(M) * loop;
+ return ops / (time * 1e6);
+ }
else
return 0.f;
}
@@ -122,8 +153,7 @@
template <typename Functor>
inline void
-Loop1P::operator()(
- Functor fcn)
+Loop1P::sweep(Functor fcn)
{
using vsip::Index;
using vsip::Vector;
@@ -154,9 +184,20 @@
// calibrate --------------------------------------------------------
while (1)
{
+ // printf("%d: calib %5d\n", rank, loop);
+ comm.barrier();
fcn(M, loop, time);
+ comm.barrier();
+
+ dist_time.local().put(0, time);
+ glob_time = dist_time;
+
+ Index<1> idx;
+
+ time = maxval(glob_time.local(), idx);
+
if (time <= 0.01) time = 0.01;
- // if (!as_data) printf("c: %5d - %5d\n", loop, time);
+ // printf("%d: time %f\n", rank, time);
float factor = goal_sec_ / time;
if (factor < 1.0) factor += 0.1 * (1.0 - factor);
@@ -173,10 +214,12 @@
printf("# ops_per_point(1) : %d\n", (int)fcn.ops_per_point(1));
printf("# riob_per_point(1): %d\n", fcn.riob_per_point(1));
printf("# wiob_per_point(1): %d\n", fcn.wiob_per_point(1));
- printf("# metric : %s\n", metric_ == pts_per_sec ? "pts_per_sec" :
- metric_ == ops_per_sec ? "ops_per_sec" :
- metric_ == iob_per_sec ? "iob_per_sec" :
- "*unknown*");
+ printf("# metric : %s\n",
+ metric_ == pts_per_sec ? "pts_per_sec" :
+ metric_ == ops_per_sec ? "ops_per_sec" :
+ metric_ == iob_per_sec ? "iob_per_sec" :
+ metric_ == wiob_per_sec ? "wiob_per_sec" :
+ "*unknown*");
if (this->note_)
printf("# note: %s\n", this->note_);
printf("# start_loop : %lu\n", (unsigned long) loop);
@@ -215,23 +258,33 @@
if (rank == 0)
{
+ size_t L;
+
+ if (this->lhs_ == lhs_mem)
+ L = M * fcn.mem_per_point(M);
+ else // (this->lhs_ == lhs_pts)
+ L = M;
+
if (this->metric_ == all_per_sec)
- printf("%7ld %f %f %f", (unsigned long) M,
+ printf("%7ld %f %f %f", (unsigned long) L,
this->metric(fcn, M, loop, mtime[(n_time-1)/2], pts_per_sec),
this->metric(fcn, M, loop, mtime[(n_time-1)/2], ops_per_sec),
this->metric(fcn, M, loop, mtime[(n_time-1)/2], iob_per_sec));
else if (n_time > 1)
// Note: max time is min op/s, and min time is max op/s
- printf("%7lu %f %f %f", (unsigned long) M,
+ printf("%7lu %f %f %f", (unsigned long) L,
this->metric(fcn, M, loop, mtime[(n_time-1)/2], metric_),
this->metric(fcn, M, loop, mtime[n_time-1], metric_),
this->metric(fcn, M, loop, mtime[0], metric_));
else
- printf("%7lu %f", (unsigned long) M,
+ printf("%7lu %f", (unsigned long) L,
this->metric(fcn, M, loop, mtime[0], metric_));
if (this->show_loop_)
printf(" %8lu", (unsigned long)loop);
+ if (this->show_time_)
+ printf(" %f", mtime[(n_time-1)/2]);
printf("\n");
+ fflush(stdout);
}
time = mtime[(n_time-1)/2];
@@ -247,4 +300,115 @@
}
}
+
+
+template <typename Functor>
+void
+Loop1P::steady(Functor fcn)
+{
+ using vsip::Index;
+ using vsip::Vector;
+ using vsip::Dense;
+ using vsip::Map;
+ using vsip::Global_map;
+ using vsip::row1_type;
+
+ size_t loop, M;
+ float time;
+
+ vsip::impl::Communicator comm = vsip::impl::default_communicator();
+ vsip::processor_type rank = comm.rank();
+ vsip::processor_type nproc = vsip::num_processors();
+
+ Vector<float, Dense<1, float, row1_type, Map<> > >
+ dist_time(nproc, Map<>(nproc));
+ Vector<float, Dense<1, float, row1_type, Global_map<1> > > glob_time(nproc);
+
+ loop = (1 << loop_start_);
+
+ if (rank == 0)
+ {
+ printf("# what : %s (%d)\n", fcn.what(), what_);
+ printf("# nproc : %d\n", nproc);
+ printf("# ops_per_point(1) : %d\n", (int)fcn.ops_per_point(1));
+ printf("# riob_per_point(1): %d\n", fcn.riob_per_point(1));
+ printf("# wiob_per_point(1): %d\n", fcn.wiob_per_point(1));
+ printf("# metric : %s\n",
+ metric_ == pts_per_sec ? "pts_per_sec" :
+ metric_ == ops_per_sec ? "ops_per_sec" :
+ metric_ == iob_per_sec ? "iob_per_sec" :
+ metric_ == wiob_per_sec ? "wiob_per_sec" :
+ "*unknown*");
+ if (this->note_)
+ printf("# note: %s\n", this->note_);
+ printf("# start_loop : %lu\n", (unsigned long) loop);
+ }
+
+ if (this->do_prof_)
+ vsip::impl::profile::prof->set_mode(vsip::impl::profile::pm_accum);
+
+ // for real ---------------------------------------------------------
+ while (1)
+ {
+ M = (1 << start_);
+
+ comm.barrier();
+ fcn(M, loop, time);
+ comm.barrier();
+
+ dist_time.local().put(0, time);
+ glob_time = dist_time;
+
+ Index<1> idx;
+
+ time = maxval(glob_time.local(), idx);
+
+#if 0
+ if (this->do_prof_)
+ {
+ char filename[256];
+ sprintf(filename, "vprof.%lu.out", (unsigned long) M);
+ vsip::impl::profile::prof->dump(filename);
+ }
+#endif
+
+ if (rank == 0)
+ {
+ if (this->metric_ == all_per_sec)
+ printf("%7ld %f %f %f", (unsigned long) M,
+ this->metric(fcn, M, loop, time, pts_per_sec),
+ this->metric(fcn, M, loop, time, ops_per_sec),
+ this->metric(fcn, M, loop, time, iob_per_sec));
+ else
+ printf("%7lu %f", (unsigned long) M,
+ this->metric(fcn, M, loop, time, metric_));
+ if (this->show_loop_)
+ printf(" %8lu", (unsigned long)loop);
+ if (this->show_time_)
+ printf(" %f", time);
+ printf("\n");
+ fflush(stdout);
+ }
+
+ float factor = goal_sec_ / time;
+ if (factor < 1.0) factor += 0.1 * (1.0 - factor);
+ loop = (int)(factor * loop);
+
+ if (loop < 1) loop = 1;
+ }
+}
+
+
+
+template <typename Functor>
+inline void
+Loop1P::operator()(
+ Functor fcn)
+{
+ if (mode_ == steady_mode)
+ this->steady(fcn);
+ else
+ this->sweep(fcn);
+}
+
#endif // CSL_LOOP_HPP
Index: benchmarks/main.cpp
===================================================================
RCS file: /home/cvs/Repository/vpp/benchmarks/main.cpp,v
retrieving revision 1.6
diff -u -r1.6 main.cpp
--- benchmarks/main.cpp 27 Jan 2006 13:13:23 -0000 1.6
+++ benchmarks/main.cpp 28 Feb 2006 22:17:51 -0000
@@ -53,7 +53,10 @@
if (sscanf(argv[i], "-%d", &what))
;
else if (!strcmp(argv[i], "-start"))
+ {
loop.start_ = atoi(argv[++i]);
+ loop.cal_ = loop.start_;
+ }
else if (!strcmp(argv[i], "-stop"))
loop.stop_ = atoi(argv[++i]);
else if (!strcmp(argv[i], "-cal"))
@@ -74,12 +77,22 @@
loop.metric_ = pts_per_sec;
else if (!strcmp(argv[i], "-iob"))
loop.metric_ = iob_per_sec;
+ else if (!strcmp(argv[i], "-wiob"))
+ loop.metric_ = wiob_per_sec;
else if (!strcmp(argv[i], "-all"))
loop.metric_ = all_per_sec;
+ else if (!strcmp(argv[i], "-mem"))
+ loop.lhs_ = lhs_mem;
else if (!strcmp(argv[i], "-prof"))
loop.do_prof_ = true;
else if (!strcmp(argv[i], "-show_loop"))
loop.show_loop_ = true;
+ else if (!strcmp(argv[i], "-show_time"))
+ loop.show_time_ = true;
+ else if (!strcmp(argv[i], "-steady"))
+ loop.mode_ = steady_mode;
+ else
+ std::cerr << "ERROR: Unknown argument: " << argv[i] << std::endl;
}
if (verbose)
Index: benchmarks/mcopy.cpp
===================================================================
RCS file: /home/cvs/Repository/vpp/benchmarks/mcopy.cpp,v
retrieving revision 1.2
diff -u -r1.2 mcopy.cpp
--- benchmarks/mcopy.cpp 22 Dec 2005 01:29:25 -0000 1.2
+++ benchmarks/mcopy.cpp 28 Feb 2006 22:17:51 -0000
@@ -20,6 +20,7 @@
#include <vsip/impl/par-chain-assign.hpp>
#include <vsip/map.hpp>
#include <vsip/impl/profile.hpp>
+#include <vsip/impl/metaprogramming.hpp>
#include "test.hpp"
#include "loop.hpp"
@@ -29,6 +30,36 @@
using namespace vsip;
+using vsip::impl::ITE_Type;
+using vsip::impl::As_type;
+
+
+
+/***********************************************************************
+ Matrix copy - normal assignment
+***********************************************************************/
+
+template <typename T,
+ typename DstBlock,
+ typename SrcBlock>
+void
+matrix_copy(
+ Matrix<T, DstBlock> dst,
+ Matrix<T, SrcBlock> src)
+{
+ dst = src;
+}
+
+struct Impl_assign;
+struct Impl_sa;
+struct Impl_memcpy;
+
+template <typename T,
+ typename SrcBlock,
+ typename DstBlock,
+ typename ImplTag>
+struct t_mcopy;
+
/***********************************************************************
@@ -38,17 +69,19 @@
template <typename T,
typename SrcBlock,
typename DstBlock>
-struct t_mcopy
+struct t_mcopy<T, SrcBlock, DstBlock, Impl_assign>
{
typedef typename SrcBlock::map_type src_map_type;
typedef typename DstBlock::map_type dst_map_type;
- char* what() { return "t_mcopy"; }
+ char* what() { return "t_mcopy<T, SrcBlock, DstBlock, Impl_assign>"; }
int ops_per_point(length_type size) { return size; }
int riob_per_point(length_type size) { return size*sizeof(T); }
int wiob_per_point(length_type size) { return size*sizeof(T); }
+ int mem_per_point(length_type size) { return size*sizeof(T); }
void operator()(length_type size, length_type loop, float& time)
+ VSIP_IMPL_NOINLINE
{
length_type const M = size;
length_type const N = size;
@@ -95,16 +128,160 @@
+/***********************************************************************
+ Matrix copy - setup assignment
+***********************************************************************/
+
+template <typename T,
+ typename SrcBlock,
+ typename DstBlock>
+struct t_mcopy<T, SrcBlock, DstBlock, Impl_sa>
+{
+ typedef typename SrcBlock::map_type src_map_type;
+ typedef typename DstBlock::map_type dst_map_type;
+
+ char* what() { return "t_mcopy<T, SrcBlock, DstBlock, Impl_sa>"; }
+ int ops_per_point(length_type size) { return size; }
+ int riob_per_point(length_type size) { return size*sizeof(T); }
+ int wiob_per_point(length_type size) { return size*sizeof(T); }
+ int mem_per_point(length_type size) { return size*sizeof(T); }
+
+ void operator()(length_type size, length_type loop, float& time)
+ VSIP_IMPL_NOINLINE
+ {
+ length_type const M = size;
+ length_type const N = size;
+
+ Matrix<T, SrcBlock> A(M, N, T(), src_map_);
+ Matrix<T, DstBlock> Z(M, N, dst_map_);
+
+ Setup_assign expr(Z, A);
+
+ for (index_type m=0; m<M; ++m)
+ for (index_type n=0; n<N; ++n)
+ {
+ A.put(m, n, T(m*N + n));
+ }
+
+ vsip::impl::profile::Timer t1;
+
+ t1.start();
+ for (index_type l=0; l<loop; ++l)
+ expr();
+ t1.stop();
+
+ for (index_type m=0; m<M; ++m)
+ for (index_type n=0; n<N; ++n)
+ {
+ if (!equal(Z.get(m, n), T(m*N+n)))
+ {
+ std::cout << "t_mcopy: ERROR" << std::endl;
+ abort();
+ }
+ }
+
+ time = t1.delta();
+ }
+
+ t_mcopy(src_map_type src_map, dst_map_type dst_map)
+ : src_map_(src_map),
+ dst_map_(dst_map)
+ {}
+
+ // Member data.
+ src_map_type src_map_;
+ dst_map_type dst_map_;
+};
+
+
+
+/***********************************************************************
+ Matrix copy - using memcpy
+***********************************************************************/
+
+template <typename T,
+ typename SrcBlock,
+ typename DstBlock>
+struct t_mcopy<T, SrcBlock, DstBlock, Impl_memcpy>
+{
+ typedef typename SrcBlock::map_type src_map_type;
+ typedef typename DstBlock::map_type dst_map_type;
+
+ char* what() { return "t_mcopy<T, SrcBlock, DstBlock, Impl_memcpy>"; }
+ int ops_per_point(length_type size) { return size; }
+ int riob_per_point(length_type size) { return size*sizeof(T); }
+ int wiob_per_point(length_type size) { return size*sizeof(T); }
+ int mem_per_point(length_type size) { return size*sizeof(T); }
+
+ void operator()(length_type size, length_type loop, float& time)
+ VSIP_IMPL_NOINLINE
+ {
+ length_type const M = size;
+ length_type const N = size;
+
+ Matrix<T, SrcBlock> A(M, N, T(), src_map_);
+ Matrix<T, DstBlock> Z(M, N, dst_map_);
+
+ for (index_type m=0; m<M; ++m)
+ for (index_type n=0; n<N; ++n)
+ {
+ A.put(m, n, T(m*N + n));
+ }
+
+ vsip::impl::profile::Timer t1;
+
+ {
+ impl::Ext_data<SrcBlock> src_ext(A.block());
+ impl::Ext_data<DstBlock> dst_ext(Z.block());
+
+ t1.start();
+ for (index_type l=0; l<loop; ++l)
+ memcpy(dst_ext.data(), src_ext.data(), M*N*sizeof(T));
+ t1.stop();
+ }
+
+ for (index_type m=0; m<M; ++m)
+ for (index_type n=0; n<N; ++n)
+ {
+ if (!equal(Z.get(m, n), T(m*N+n)))
+ {
+ std::cout << "t_mcopy: ERROR" << std::endl;
+ abort();
+ }
+ }
+
+ time = t1.delta();
+ }
+
+ t_mcopy(src_map_type src_map, dst_map_type dst_map)
+ : src_map_(src_map),
+ dst_map_(dst_map)
+ {}
+
+ // Member data.
+ src_map_type src_map_;
+ dst_map_type dst_map_;
+};
+
+
+
+/***********************************************************************
+ Benchmark driver for local copy
+***********************************************************************/
+
template <typename T,
typename SrcOrder,
- typename DstOrder>
+ typename DstOrder,
+ typename ImplTag>
struct t_mcopy_local : t_mcopy<T,
Dense<2, T, SrcOrder, Local_map>,
- Dense<2, T, DstOrder, Local_map> >
+ Dense<2, T, DstOrder, Local_map>,
+ ImplTag>
{
typedef t_mcopy<T,
Dense<2, T, SrcOrder, Local_map>,
- Dense<2, T, DstOrder, Local_map> > base_type;
+ Dense<2, T, DstOrder, Local_map>,
+ ImplTag> base_type;
t_mcopy_local()
: base_type(Local_map(), Local_map())
{}
@@ -113,41 +290,73 @@
template <typename T,
typename SrcOrder,
- typename DstOrder>
+ typename DstOrder,
+ typename ImplTag>
struct t_mcopy_pb : t_mcopy<T,
Plain_block<2, T, SrcOrder, Local_map>,
- Plain_block<2, T, DstOrder, Local_map> >
+ Plain_block<2, T, DstOrder, Local_map>,
+ ImplTag>
{
typedef t_mcopy<T,
Plain_block<2, T, SrcOrder, Local_map>,
- Plain_block<2, T, DstOrder, Local_map> > base_type;
+ Plain_block<2, T, DstOrder, Local_map>,
+ ImplTag > base_type;
t_mcopy_pb()
: base_type(Local_map(), Local_map())
{}
};
-#if 0
+
+
+/***********************************************************************
+ Benchmark driver for distributed copy
+***********************************************************************/
+
template <typename T,
- typename SrcOrder,
- typename DstOrder>
-struct t_mcopy_root : t_mcopy<T,
- Dense<2, T, SrcOrder, MapT>,
- Dense<2, T, DstOrder, MapT>,
- Local_map>
+ int SrcDO,
+ int DstDO,
+ typename ImplTag>
+struct t_mcopy_par_helper
{
- typedef t_mcopy<T, SrcOrder, DstOrder, Map<> > base_type;
- typedef t_mcopy<T,
- Dense<2, T, SrcOrder, MapT>,
- Dense<2, T, DstOrder, MapT>,
- Map<> > base_type;
- t_mcopy_root()
- : base_type(Map<>())
+ typedef Map<Block_dist, Block_dist> map_type;
+
+ typedef typename ITE_Type<SrcDO == 0,
+ As_type<row2_type>, As_type<col2_type> >::type
+ src_order_type;
+ typedef Dense<2, T, src_order_type, map_type> src_block_type;
+
+ typedef typename ITE_Type<DstDO == 0,
+ As_type<row2_type>, As_type<col2_type> >::type
+ dst_order_type;
+ typedef Dense<2, T, dst_order_type, map_type> dst_block_type;
+
+ typedef t_mcopy<T, src_block_type, dst_block_type, ImplTag> base_type;
+};
+
+
+template <typename T,
+ int SrcDO,
+ int DstDO,
+ typename ImplTag>
+struct t_mcopy_par
+ : t_mcopy_par_helper<T, SrcDO, DstDO, ImplTag>::base_type
+{
+ typedef t_mcopy_par_helper<T, SrcDO, DstDO, ImplTag> helper_type;
+ typedef typename helper_type::map_type map_type;
+ typedef typename helper_type::base_type base_type;
+
+ t_mcopy_par(processor_type src_np, processor_type dst_np)
+ : base_type(SrcDO == 0 ? map_type(src_np, 1) : map_type(1, src_np),
+ DstDO == 0 ? map_type(dst_np, 1) : map_type(1, dst_np))
{}
};
-#endif
+/***********************************************************************
+ Main functions
+***********************************************************************/
+
void
defaults(Loop1P& loop)
{
@@ -159,23 +368,38 @@
int
test(Loop1P& loop, int what)
{
+ processor_type np = num_processors();
+
+ typedef row2_type rt;
+ typedef col2_type ct;
+
switch (what)
{
- case 1: loop(t_mcopy_local<float, row2_type, row2_type>()); break;
- case 2: loop(t_mcopy_local<float, row2_type, col2_type>()); break;
- case 3: loop(t_mcopy_local<float, col2_type, row2_type>()); break;
- case 4: loop(t_mcopy_local<float, col2_type, col2_type>()); break;
-
- case 5: loop(t_mcopy_local<complex<float>, row2_type, row2_type>()); break;
- case 6: loop(t_mcopy_local<complex<float>, row2_type, col2_type>()); break;
- case 7: loop(t_mcopy_local<complex<float>, col2_type, row2_type>()); break;
- case 8: loop(t_mcopy_local<complex<float>, col2_type, col2_type>()); break;
-
+ case 1: loop(t_mcopy_local<float, rt, rt, Impl_assign>()); break;
+ case 2: loop(t_mcopy_local<float, rt, ct, Impl_assign>()); break;
+ case 3: loop(t_mcopy_local<float, ct, rt, Impl_assign>()); break;
+ case 4: loop(t_mcopy_local<float, ct, ct, Impl_assign>()); break;
+
+ case 5: loop(t_mcopy_local<complex<float>, rt, rt, Impl_assign>()); break;
+ case 6: loop(t_mcopy_local<complex<float>, rt, ct, Impl_assign>()); break;
+ case 7: loop(t_mcopy_local<complex<float>, ct, rt, Impl_assign>()); break;
+ case 8: loop(t_mcopy_local<complex<float>, ct, ct, Impl_assign>()); break;
+
+ case 11: loop(t_mcopy_local<float, rt, rt, Impl_memcpy>()); break;
+ // case 12: loop(t_mcopy_local<float, rt, ct, Impl_memcpy>()); break;
+ // case 13: loop(t_mcopy_local<float, ct, rt, Impl_memcpy>()); break;
+ case 14: loop(t_mcopy_local<float, ct, ct, Impl_memcpy>()); break;
- case 20: loop(t_mcopy_pb<float, row2_type, row2_type>()); break;
- case 21: loop(t_mcopy_pb<float, row2_type, col2_type>()); break;
- case 22: loop(t_mcopy_pb<float, col2_type, row2_type>()); break;
- case 23: loop(t_mcopy_pb<float, col2_type, col2_type>()); break;
+ case 21: loop(t_mcopy_pb<float, rt, rt, Impl_assign>()); break;
+ case 22: loop(t_mcopy_pb<float, rt, ct, Impl_assign>()); break;
+ case 23: loop(t_mcopy_pb<float, ct, rt, Impl_assign>()); break;
+ case 24: loop(t_mcopy_pb<float, ct, ct, Impl_assign>()); break;
+
+ case 31: loop(t_mcopy_par<float, 0, 0, Impl_assign>(np, np)); break;
+ case 32: loop(t_mcopy_par<float, 0, 1, Impl_assign>(np, np)); break;
+
+ case 41: loop(t_mcopy_par<float, 0, 0, Impl_sa>(np, np)); break;
+ case 42: loop(t_mcopy_par<float, 0, 1, Impl_sa>(np, np)); break;
default:
return 0;
Index: benchmarks/mcopy_ipp.cpp
===================================================================
RCS file: benchmarks/mcopy_ipp.cpp
diff -N benchmarks/mcopy_ipp.cpp
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ benchmarks/mcopy_ipp.cpp 28 Feb 2006 22:17:51 -0000
@@ -0,0 +1,486 @@
+/* Copyright (c) 2006 by CodeSourcery, LLC. All rights reserved. */
+
+/** @file benchmarks/mcopy_ipp.cpp
+ @author Jules Bergmann
+ @date 2005-02-06
+ @brief VSIPL++ Library: Benchmark for matrix copy using IPP
+
+*/
+
+/***********************************************************************
+ Included Files
+***********************************************************************/
+
+#include <iostream>
+
+#include <ippm.h>
+
+#include <vsip/initfin.hpp>
+#include <vsip/support.hpp>
+#include <vsip/math.hpp>
+#include <vsip/impl/setup-assign.hpp>
+#include <vsip/impl/par-chain-assign.hpp>
+#include <vsip/map.hpp>
+#include <vsip/impl/profile.hpp>
+#include <vsip/impl/metaprogramming.hpp>
+
+#include "test.hpp"
+#include "output.hpp"
+#include "loop.hpp"
+#include "ops_info.hpp"
+
+#include "plainblock.hpp"
+
+using namespace vsip;
+
+using vsip::impl::ITE_Type;
+using vsip::impl::As_type;
+
+
+void
+ipp_copy(
+ float* src,
+ stride_type src_row_stride,
+ stride_type src_col_stride,
+ float* dst,
+ stride_type dst_row_stride,
+ stride_type dst_col_stride,
+ length_type rows,
+ length_type cols)
+{
+ ippmCopy_ma_32f_SS(
+ src, rows*cols,
+ src_row_stride*sizeof(float), src_col_stride*sizeof(float),
+ dst, rows*cols,
+ dst_row_stride*sizeof(float), dst_col_stride*sizeof(float),
+ cols, rows, 1);
+}
+
+
+
+void
+ipp_transpose(
+ float* src,
+ stride_type src_row_stride,
+ stride_type src_col_stride,
+ float* dst,
+ stride_type dst_row_stride,
+ stride_type dst_col_stride,
+ length_type rows,
+ length_type cols)
+{
+ assert(src_col_stride == 1);
+ assert(dst_col_stride == 1);
+ ippmTranspose_m_32f(
+ src, src_row_stride*sizeof(float),
+ cols, rows,
+ dst, dst_row_stride*sizeof(float)
+ );
+}
+
+
+
+/***********************************************************************
+ Matrix copy - normal assignment
+***********************************************************************/
+
+template <typename T,
+ typename DstBlock,
+ typename SrcBlock>
+void
+matrix_copy(
+ Matrix<T, DstBlock> dst,
+ Matrix<T, SrcBlock> src)
+{
+ dst = src;
+}
+
+struct Impl_copy;
+struct Impl_transpose;
+struct Impl_select;
+
+template <typename T,
+ typename SrcBlock,
+ typename DstBlock,
+ typename ImplTag>
+struct t_mcopy;
+
+
+
+/***********************************************************************
+ Matrix copy - using Copy
+***********************************************************************/
+
+template <typename T,
+ typename SrcBlock,
+ typename DstBlock>
+struct t_mcopy<T, SrcBlock, DstBlock, Impl_copy>
+{
+ typedef typename SrcBlock::map_type src_map_type;
+ typedef typename DstBlock::map_type dst_map_type;
+
+ char* what() { return "t_mcopy<T, SrcBlock, DstBlock, Impl_copy>"; }
+ int ops_per_point(length_type size) { return size; }
+ int riob_per_point(length_type size) { return size*sizeof(T); }
+ int wiob_per_point(length_type size) { return size*sizeof(T); }
+
+ void operator()(length_type size, length_type loop, float& time)
+ VSIP_IMPL_NOINLINE
+ {
+ length_type const M = size;
+ length_type const N = size;
+
+ Matrix<T, SrcBlock> A(M, N, T(), src_map_);
+ Matrix<T, DstBlock> Z(M, N, dst_map_);
+
+ for (index_type m=0; m<M; ++m)
+ for (index_type n=0; n<N; ++n)
+ {
+ A.put(m, n, T(m*N + n));
+ }
+
+ vsip::impl::profile::Timer t1;
+
+ {
+ impl::Ext_data<SrcBlock> src_ext(A.block());
+ impl::Ext_data<DstBlock> dst_ext(Z.block());
+
+ t1.start();
+ for (index_type l=0; l<loop; ++l)
+ ipp_copy(src_ext.data(), src_ext.stride(0), src_ext.stride(1),
+ dst_ext.data(), dst_ext.stride(0), dst_ext.stride(1),
+ M, N);
+ t1.stop();
+ }
+
+ for (index_type m=0; m<M; ++m)
+ for (index_type n=0; n<N; ++n)
+ {
+ if (!equal(Z.get(m, n), T(m*N+n)))
+ {
+ std::cout << "t_mcopy: ERROR" << std::endl;
+ std::cout << "A = " << A << std::endl;
+ std::cout << "Z = " << Z << std::endl;
+ abort();
+ }
+ }
+
+ time = t1.delta();
+ }
+
+ t_mcopy(src_map_type src_map, dst_map_type dst_map)
+ : src_map_(src_map),
+ dst_map_(dst_map)
+ {}
+
+ // Member data.
+ src_map_type src_map_;
+ dst_map_type dst_map_;
+};
+
+
+
+/***********************************************************************
+ Matrix copy - using Transpose
+***********************************************************************/
+
+template <typename T,
+ typename SrcBlock,
+ typename DstBlock>
+struct t_mcopy<T, SrcBlock, DstBlock, Impl_transpose>
+{
+ typedef typename SrcBlock::map_type src_map_type;
+ typedef typename DstBlock::map_type dst_map_type;
+
+ char* what() { return "t_mcopy<T, SrcBlock, DstBlock, Impl_transpose>"; }
+ int ops_per_point(length_type size) { return size; }
+ int riob_per_point(length_type size) { return size*sizeof(T); }
+ int wiob_per_point(length_type size) { return size*sizeof(T); }
+
+ void operator()(length_type size, length_type loop, float& time)
+ VSIP_IMPL_NOINLINE
+ {
+ length_type const M = size;
+ length_type const N = size;
+
+ Matrix<T, SrcBlock> A(M, N, T(), src_map_);
+ Matrix<T, DstBlock> Z(M, N, dst_map_);
+
+ for (index_type m=0; m<M; ++m)
+ for (index_type n=0; n<N; ++n)
+ {
+ A.put(m, n, T(m*N + n));
+ }
+
+ vsip::impl::profile::Timer t1;
+
+ {
+ impl::Ext_data<SrcBlock> src_ext(A.block());
+ impl::Ext_data<DstBlock> dst_ext(Z.block());
+
+ if (src_ext.stride(1) == 1 && dst_ext.stride(0) == 1)
+ {
+ t1.start();
+ for (index_type l=0; l<loop; ++l)
+ ipp_transpose(
+ src_ext.data(), src_ext.stride(0), src_ext.stride(1),
+ dst_ext.data(), dst_ext.stride(1), dst_ext.stride(0),
+ M, N);
+ t1.stop();
+ }
+ else if (src_ext.stride(0) == 1 && dst_ext.stride(1) == 1)
+ {
+ t1.start();
+ for (index_type l=0; l<loop; ++l)
+ ipp_transpose(
+ src_ext.data(), src_ext.stride(1), src_ext.stride(0),
+ dst_ext.data(), dst_ext.stride(0), dst_ext.stride(1),
+ N, M);
+ t1.stop();
+ }
+ else assert(0);
+ }
+
+ for (index_type m=0; m<M; ++m)
+ for (index_type n=0; n<N; ++n)
+ {
+ if (!equal(Z.get(m, n), T(m*N+n)))
+ {
+ std::cout << "t_mcopy: ERROR" << std::endl;
+ std::cout << "A = " << A << std::endl;
+ std::cout << "Z = " << Z << std::endl;
+ abort();
+ }
+ }
+
+ time = t1.delta();
+ }
+
+ t_mcopy(src_map_type src_map, dst_map_type dst_map)
+ : src_map_(src_map),
+ dst_map_(dst_map)
+ {}
+
+ // Member data.
+ src_map_type src_map_;
+ dst_map_type dst_map_;
+};
+
+
+
+/***********************************************************************
+ Matrix copy - using select
+***********************************************************************/
+
+template <typename T,
+ typename SrcBlock,
+ typename DstBlock>
+struct t_mcopy<T, SrcBlock, DstBlock, Impl_select>
+{
+ typedef typename SrcBlock::map_type src_map_type;
+ typedef typename DstBlock::map_type dst_map_type;
+
+ char* what() { return "t_mcopy<T, SrcBlock, DstBlock, Impl_copy>"; }
+ int ops_per_point(length_type size) { return size; }
+ int riob_per_point(length_type size) { return size*sizeof(T); }
+ int wiob_per_point(length_type size) { return size*sizeof(T); }
+
+ void operator()(length_type size, length_type loop, float& time)
+ VSIP_IMPL_NOINLINE
+ {
+ length_type const M = size;
+ length_type const N = size;
+
+ Matrix<T, SrcBlock> A(M, N, T(), src_map_);
+ Matrix<T, DstBlock> Z(M, N, dst_map_);
+
+ for (index_type m=0; m<M; ++m)
+ for (index_type n=0; n<N; ++n)
+ {
+ A.put(m, n, T(m*N + n));
+ }
+
+ vsip::impl::profile::Timer t1;
+
+ {
+ impl::Ext_data<SrcBlock> src_ext(A.block());
+ impl::Ext_data<DstBlock> dst_ext(Z.block());
+
+ if (src_ext.stride(1) == 1 && dst_ext.stride(0) == 1)
+ {
+ t1.start();
+ for (index_type l=0; l<loop; ++l)
+ ipp_transpose(
+ src_ext.data(), src_ext.stride(0), src_ext.stride(1),
+ dst_ext.data(), dst_ext.stride(1), dst_ext.stride(0),
+ M, N);
+ t1.stop();
+ }
+ else
+ {
+ t1.start();
+ for (index_type l=0; l<loop; ++l)
+ ipp_copy(src_ext.data(), src_ext.stride(0), src_ext.stride(1),
+ dst_ext.data(), dst_ext.stride(0), dst_ext.stride(1),
+ M, N);
+ t1.stop();
+ }
+ }
+
+ for (index_type m=0; m<M; ++m)
+ for (index_type n=0; n<N; ++n)
+ {
+ if (!equal(Z.get(m, n), T(m*N+n)))
+ {
+ std::cout << "t_mcopy: ERROR" << std::endl;
+ std::cout << "A = " << A << std::endl;
+ std::cout << "Z = " << Z << std::endl;
+ abort();
+ }
+ }
+
+ time = t1.delta();
+ }
+
+ t_mcopy(src_map_type src_map, dst_map_type dst_map)
+ : src_map_(src_map),
+ dst_map_(dst_map)
+ {}
+
+ // Member data.
+ src_map_type src_map_;
+ dst_map_type dst_map_;
+};
+
+
+
+/***********************************************************************
+ Benchmark driver for local copy
+***********************************************************************/
+
+template <typename T,
+ typename SrcOrder,
+ typename DstOrder,
+ typename ImplTag>
+struct t_mcopy_local : t_mcopy<T,
+ Dense<2, T, SrcOrder, Local_map>,
+ Dense<2, T, DstOrder, Local_map>,
+ ImplTag>
+{
+ typedef t_mcopy<T,
+ Dense<2, T, SrcOrder, Local_map>,
+ Dense<2, T, DstOrder, Local_map>,
+ ImplTag> base_type;
+ t_mcopy_local()
+ : base_type(Local_map(), Local_map())
+ {}
+};
+
+
+template <typename T,
+ typename SrcOrder,
+ typename DstOrder,
+ typename ImplTag>
+struct t_mcopy_pb : t_mcopy<T,
+ Plain_block<2, T, SrcOrder, Local_map>,
+ Plain_block<2, T, DstOrder, Local_map>,
+ ImplTag>
+{
+ typedef t_mcopy<T,
+ Plain_block<2, T, SrcOrder, Local_map>,
+ Plain_block<2, T, DstOrder, Local_map>,
+ ImplTag > base_type;
+ t_mcopy_pb()
+ : base_type(Local_map(), Local_map())
+ {}
+};
+
+
+
+/***********************************************************************
+ Benchmark driver for distributed copy
+***********************************************************************/
+
+template <typename T,
+ int SrcDO,
+ int DstDO,
+ typename ImplTag>
+struct t_mcopy_par_helper
+{
+ typedef Map<Block_dist, Block_dist> map_type;
+
+ typedef typename ITE_Type<SrcDO == 0,
+ As_type<row2_type>, As_type<col2_type> >::type
+ src_order_type;
+ typedef Dense<2, T, src_order_type, map_type> src_block_type;
+
+ typedef typename ITE_Type<DstDO == 0,
+ As_type<row2_type>, As_type<col2_type> >::type
+ dst_order_type;
+ typedef Dense<2, T, dst_order_type, map_type> dst_block_type;
+
+ typedef t_mcopy<T, src_block_type, dst_block_type, ImplTag> base_type;
+};
+
+
+template <typename T,
+ int SrcDO,
+ int DstDO,
+ typename ImplTag>
+struct t_mcopy_par
+ : t_mcopy_par_helper<T, SrcDO, DstDO, ImplTag>::base_type
+{
+ typedef t_mcopy_par_helper<T, SrcDO, DstDO, ImplTag> helper_type;
+ typedef typename helper_type::map_type map_type;
+ typedef typename helper_type::base_type base_type;
+
+ t_mcopy_par(processor_type src_np, processor_type dst_np)
+ : base_type(SrcDO == 0 ? map_type(src_np, 1) : map_type(1, src_np),
+ DstDO == 0 ? map_type(dst_np, 1) : map_type(1, dst_np))
+ {}
+};
+
+
+
+/***********************************************************************
+ Main functions
+***********************************************************************/
+
+void
+defaults(Loop1P& loop)
+{
+ loop.stop_ = 12;
+}
+
+
+
+int
+test(Loop1P& loop, int what)
+{
+ processor_type np = num_processors();
+
+ typedef row2_type rt;
+ typedef col2_type ct;
+
+ switch (what)
+ {
+ case 1: loop(t_mcopy_local<float, rt, rt, Impl_select>()); break;
+ case 2: loop(t_mcopy_local<float, rt, ct, Impl_select>()); break;
+ case 3: loop(t_mcopy_local<float, ct, rt, Impl_select>()); break;
+ case 4: loop(t_mcopy_local<float, ct, ct, Impl_select>()); break;
+
+ case 11: loop(t_mcopy_local<float, rt, rt, Impl_copy>()); break;
+ case 12: loop(t_mcopy_local<float, rt, ct, Impl_copy>()); break;
+ case 13: loop(t_mcopy_local<float, ct, rt, Impl_copy>()); break;
+ case 14: loop(t_mcopy_local<float, ct, ct, Impl_copy>()); break;
+
+ // case 21: loop(t_mcopy_local<float, rt, rt, Impl_transpose>()); break;
+ case 22: loop(t_mcopy_local<float, rt, ct, Impl_transpose>()); break;
+ case 23: loop(t_mcopy_local<float, ct, rt, Impl_transpose>()); break;
+ // case 24: loop(t_mcopy_local<float, ct, ct, Impl_transpose>()); break;
+
+ default:
+ return 0;
+ }
+ return 1;
+}
Index: benchmarks/mpi_alltoall.cpp
===================================================================
RCS file: benchmarks/mpi_alltoall.cpp
diff -N benchmarks/mpi_alltoall.cpp
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ benchmarks/mpi_alltoall.cpp 28 Feb 2006 22:17:51 -0000
@@ -0,0 +1,1164 @@
+/* Copyright (c) 2005 by CodeSourcery, LLC. All rights reserved. */
+
+/** @file benchmarks/mpi_alltoall.cpp
+ @author Jules Bergmann
+ @date 2005-11-06
+ @brief VSIPL++ Library: Benchmark for MPI alltoall.
+
+*/
+
+/***********************************************************************
+ Included Files
+***********************************************************************/
+
+#include <iostream>
+
+#include <vsip/initfin.hpp>
+#include <vsip/support.hpp>
+#include <vsip/math.hpp>
+#include <vsip/map.hpp>
+#include <vsip/impl/profile.hpp>
+
+#include "test.hpp"
+#include "loop.hpp"
+#include "ops_info.hpp"
+
+using namespace vsip;
+
+
+
+template <typename T,
+ typename SBlock,
+ typename DBlock,
+ typename ImplTag>
+class Send;
+
+struct Impl_alltoall;
+struct Impl_isend;
+struct Impl_isend_x;
+struct Impl_persistent;
+struct Impl_persistent_x;
+
+
+
+/***********************************************************************
+ Send class using MPI_alltoall.
+***********************************************************************/
+
+template <typename T,
+ typename SBlock,
+ typename DBlock>
+class Send<T, SBlock, DBlock, Impl_alltoall>
+{
+ typedef typename SBlock::map_type src_map_type;
+ typedef typename DBlock::map_type dst_map_type;
+
+ typedef typename impl::Block_layout<SBlock>::order_type src_order_type;
+ typedef typename impl::Block_layout<DBlock>::order_type dst_order_type;
+
+ typedef typename impl::Distributed_local_block<DBlock>::type
+ dst_local_block_t;
+ typedef typename impl::Distributed_local_block<SBlock>::type
+ src_local_block_t;
+
+ // Constructor.
+public:
+
+ Send(
+ Matrix<T, SBlock> src,
+ Matrix<T, DBlock> dst)
+ : src_ (src),
+ dst_ (dst),
+ src_ext_(src_.local().block()),
+ dst_ext_(dst_.local().block())
+ {
+ using vsip::impl::Type_equal;
+
+ src_map_type src_map_ = src_.block().map();
+ dst_map_type dst_map_ = dst_.block().map();
+
+ // Check assumptions ----------------------------------------------
+
+ // This benchmarking code assumes a particular data layout,
+ // check that inputs match layout before proceeding.
+
+ // Source is row-major, block distributed by row.
+ assert((Type_equal<row2_type, src_order_type>::value));
+ assert(src_map_.num_subblocks(0) >= 1);
+ assert(src_map_.num_subblocks(1) == 1);
+ assert(src_map_.distribution(0) == block);
+ assert(src_.size(0) % src_map_.num_subblocks(0) == 0);
+
+ // Destination is row- or column-major, block distributed by column.
+// assert((Type_equal<col2_type, dst_order_type>::value));
+ assert(dst_map_.num_subblocks(0) == 1);
+ assert(dst_map_.num_subblocks(1) >= 1);
+ assert(dst_map_.distribution(1) == block);
+ assert(dst_.size(1) % dst_map_.num_subblocks(1) == 0);
+
+ // Create send-side datatype.
+ length_type nrows_per_send = src_.size(0) / src_map_.num_subblocks(0);
+ length_type ncols_per_recv = dst_.size(1) / dst_map_.num_subblocks(1);
+
+// printf("(%d): nrows_per_send = %d, ncols_per_recv = %d\n",
+// rank, nrows_per_send, ncols_per_recv);
+
+ // Setup source datatype. -----------------------------------------
+
+ assert(src_ext_.stride(1) == 1);
+
+ MPI_Type_vector(nrows_per_send, // number of blocks
+ ncols_per_recv, // elements per block
+ src_.size(1), // stride between block starts
+ MPI_FLOAT,
+ &tmp0_datatype_);
+ MPI_Type_commit(&tmp0_datatype_);
+
+ int lena[2] = { 1, 1 };
+ MPI_Aint loca[2] = { 0, ncols_per_recv * sizeof(T) };
+ MPI_Datatype typesa[2] = { tmp0_datatype_, MPI_UB };
+ MPI_Type_struct(2, lena, loca, typesa, &src_datatype_);
+ MPI_Type_commit(&src_datatype_);
+
+
+
+ // Setup destination datatype -------------------------------------
+
+ if (Type_equal<dst_order_type, col2_type>::value)
+ {
+ assert(dst_ext.stride(0) == 1);
+
+ MPI_Type_vector (ncols_per_recv, // number of blocks
+ 1,
+ dst_.size(0),
+ MPI_FLOAT,
+ &tmp1_datatype_);
+ MPI_Type_commit(&tmp1_datatype_);
+
+ int len[3] = { 1, 1, 1 };
+ MPI_Aint loc[3] = { 0, 0, sizeof(T) };
+ MPI_Datatype types[3] = { tmp1_datatype_, MPI_LB, MPI_UB };
+ MPI_Type_struct(3, len, loc, types, &tmp2_datatype_);
+ MPI_Type_commit(&tmp2_datatype_);
+
+
+ MPI_Type_hvector(nrows_per_send, 1, sizeof(float), tmp2_datatype_,
+ &dst_datatype_);
+ MPI_Type_commit(&dst_datatype_);
+ }
+ else
+ {
+ assert(dst_ext.stride(1) == 1);
+
+ MPI_Type_vector(nrows_per_send, // number of blocks
+ ncols_per_recv, // elements per block
+ ncols_per_recv, // stride between block starts
+ MPI_FLOAT,
+ &dst_datatype_);
+ MPI_Type_commit(&dst_datatype_);
+ }
+
+ }
+
+ ~Send()
+ {
+ using vsip::impl::Type_equal;
+
+ MPI_Type_free(&tmp0_datatype_);
+ if (Type_equal<dst_order_type, col2_type>::value)
+ {
+ MPI_Type_free(&tmp1_datatype_);
+ MPI_Type_free(&tmp2_datatype_);
+ }
+ MPI_Type_free(&src_datatype_);
+ MPI_Type_free(&dst_datatype_);
+ }
+
+ void operator()()
+ {
+ MPI_Alltoall(src_ext_.data(), 1, src_datatype_,
+ dst_ext_.data(), 1, dst_datatype_,
+ MPI_COMM_WORLD);
+ }
+
+ // Member data.
+private:
+ Matrix<T, SBlock> src_;
+ Matrix<T, DBlock> dst_;
+
+ impl::Ext_data<dst_local_block_t> dst_ext_;
+ impl::Ext_data<src_local_block_t> src_ext_;
+
+ MPI_Datatype src_datatype_;
+ MPI_Datatype dst_datatype_;
+ MPI_Datatype tmp0_datatype_;
+ MPI_Datatype tmp1_datatype_;
+ MPI_Datatype tmp2_datatype_;
+};
+
+
+
+/***********************************************************************
+ Send class using MPI_isend / MPI_recv
+***********************************************************************/
+
+template <typename T,
+ typename SBlock,
+ typename DBlock>
+class Send<T, SBlock, DBlock, Impl_isend>
+{
+ typedef typename SBlock::map_type src_map_type;
+ typedef typename DBlock::map_type dst_map_type;
+
+ typedef typename impl::Block_layout<SBlock>::order_type src_order_type;
+ typedef typename impl::Block_layout<DBlock>::order_type dst_order_type;
+
+ typedef typename impl::Distributed_local_block<DBlock>::type
+ dst_local_block_t;
+ typedef typename impl::Distributed_local_block<SBlock>::type
+ src_local_block_t;
+
+ // Constructor.
+public:
+
+ Send(
+ Matrix<T, SBlock> src,
+ Matrix<T, DBlock> dst)
+ : src_ (src),
+ dst_ (dst),
+ src_ext_(src_.local().block()),
+ dst_ext_(dst_.local().block()),
+ nproc_ (src_.block().map().num_subblocks(0)),
+ req_ (new MPI_Request[nproc_])
+ {
+ using vsip::impl::Type_equal;
+
+ src_map_type src_map_ = src_.block().map();
+ dst_map_type dst_map_ = dst_.block().map();
+
+ // Check assumptions ----------------------------------------------
+
+ // This benchmarking code assumes a particular data layout,
+ // check that inputs match layout before proceeding.
+
+ // Source is row-major, block distributed by row.
+ assert((Type_equal<row2_type, src_order_type>::value));
+ assert(src_map_.num_subblocks(0) >= 1);
+ assert(src_map_.num_subblocks(1) == 1);
+ assert(src_map_.distribution(0) == block);
+ assert(src_.size(0) % src_map_.num_subblocks(0) == 0);
+
+ // Destination is row- or column-major, block distributed by column.
+ assert(dst_map_.num_subblocks(0) == 1);
+ assert(dst_map_.num_subblocks(1) >= 1);
+ assert(dst_map_.distribution(1) == block);
+ assert(dst_.size(1) % dst_map_.num_subblocks(1) == 0);
+
+ // Number of senders == number of receivers
+ assert(src_map_.num_subblocks(0) == dst_map_.num_subblocks(1));
+
+ // Create send-side datatype.
+ nrows_per_send_ = src_.size(0) / src_map_.num_subblocks(0);
+ ncols_per_recv_ = dst_.size(1) / dst_map_.num_subblocks(1);
+
+// printf("(%d): nrows_per_send = %d, ncols_per_recv = %d\n",
+// rank, nrows_per_send_, ncols_per_recv_);
+
+ // Setup source datatype. -----------------------------------------
+
+ assert(src_ext_.stride(1) == 1);
+
+ MPI_Type_vector(nrows_per_send_, // number of blocks
+ ncols_per_recv_, // elements per block
+ src_.size(1), // stride between block starts
+ MPI_FLOAT,
+ &tmp0_datatype_);
+ MPI_Type_commit(&tmp0_datatype_);
+
+ int lena[2] = { 1, 1 };
+ MPI_Aint loca[2] = { 0, ncols_per_recv_ * sizeof(T) };
+ MPI_Datatype typesa[2] = { tmp0_datatype_, MPI_UB };
+ MPI_Type_struct(2, lena, loca, typesa, &src_datatype_);
+ MPI_Type_commit(&src_datatype_);
+
+
+
+ // Setup destination datatype -------------------------------------
+
+ if (Type_equal<dst_order_type, col2_type>::value)
+ {
+ assert(dst_ext.stride(0) == 1);
+
+ MPI_Type_vector (ncols_per_recv_, // number of blocks
+ 1,
+ dst_.size(0),
+ MPI_FLOAT,
+ &tmp1_datatype_);
+ MPI_Type_commit(&tmp1_datatype_);
+
+ int len[3] = { 1, 1, 1 };
+ MPI_Aint loc[3] = { 0, 0, sizeof(T) };
+ MPI_Datatype types[3] = { tmp1_datatype_, MPI_LB, MPI_UB };
+ MPI_Type_struct(3, len, loc, types, &tmp2_datatype_);
+ MPI_Type_commit(&tmp2_datatype_);
+
+
+ MPI_Type_hvector(nrows_per_send_, 1, sizeof(float), tmp2_datatype_,
+ &dst_datatype_);
+ MPI_Type_commit(&dst_datatype_);
+ }
+ else
+ {
+ assert(dst_ext.stride(1) == 1);
+
+ MPI_Type_vector(nrows_per_send_, // number of blocks
+ ncols_per_recv_, // elements per block
+ ncols_per_recv_, // stride between block starts
+ MPI_FLOAT,
+ &dst_datatype_);
+ MPI_Type_commit(&dst_datatype_);
+ }
+
+ }
+
+ ~Send()
+ {
+ using vsip::impl::Type_equal;
+
+ MPI_Type_free(&tmp0_datatype_);
+ if (Type_equal<dst_order_type, col2_type>::value)
+ {
+ MPI_Type_free(&tmp1_datatype_);
+ MPI_Type_free(&tmp2_datatype_);
+ }
+ MPI_Type_free(&src_datatype_);
+ MPI_Type_free(&dst_datatype_);
+
+ delete[] req_;
+ }
+
+ void operator()()
+ {
+ for (index_type i=0; i<nproc_; ++i)
+ {
+ MPI_Isend(src_ext_.data() + i*ncols_per_recv_,
+ 1, src_datatype_,
+ i, 0, MPI_COMM_WORLD, &(req_[i]));
+ }
+
+ if (impl::Type_equal<dst_order_type, col2_type>::value)
+ {
+ for (index_type i=0; i<nproc_; ++i)
+ {
+ MPI_Status status;
+ int ierr = MPI_Recv(dst_ext_.data() + i*nrows_per_send_,
+ 1, dst_datatype_, i, 0,
+ MPI_COMM_WORLD, &status);
+ }
+ }
+ else
+ {
+ for (index_type i=0; i<nproc_; ++i)
+ {
+ MPI_Status status;
+ int ierr = MPI_Recv(dst_ext_.data()
+ + i*nrows_per_send_*ncols_per_recv_,
+ 1, dst_datatype_, i, 0,
+ MPI_COMM_WORLD, &status);
+ }
+ }
+
+ for (index_type i=0; i<nproc_; ++i)
+ {
+ MPI_Status status;
+ MPI_Wait(&(req_[i]), &status);
+ }
+ }
+
+ // Member data.
+private:
+ Matrix<T, SBlock> src_;
+ Matrix<T, DBlock> dst_;
+
+ impl::Ext_data<dst_local_block_t> dst_ext_;
+ impl::Ext_data<src_local_block_t> src_ext_;
+
+ length_type nproc_;
+ length_type nrows_per_send_;
+ length_type ncols_per_recv_;
+
+ MPI_Request* req_;
+
+ MPI_Datatype src_datatype_;
+ MPI_Datatype dst_datatype_;
+ MPI_Datatype tmp0_datatype_;
+ MPI_Datatype tmp1_datatype_;
+ MPI_Datatype tmp2_datatype_;
+};
+
+
+
+/***********************************************************************
+ Send class using MPI_isend / MPI_recv + local copy
+***********************************************************************/
+
+template <typename T,
+ typename SBlock,
+ typename DBlock>
+class Send<T, SBlock, DBlock, Impl_isend_x>
+{
+ typedef typename SBlock::map_type src_map_type;
+ typedef typename DBlock::map_type dst_map_type;
+
+ typedef typename impl::Block_layout<SBlock>::order_type src_order_type;
+ typedef typename impl::Block_layout<DBlock>::order_type dst_order_type;
+
+ typedef typename impl::Distributed_local_block<DBlock>::type
+ dst_local_block_t;
+ typedef typename impl::Distributed_local_block<SBlock>::type
+ src_local_block_t;
+
+ // Constructor.
+public:
+
+ Send(
+ Matrix<T, SBlock> src,
+ Matrix<T, DBlock> dst)
+ : src_ (src),
+ dst_ (dst),
+ src_ext_(src_.local().block()),
+ dst_ext_(dst_.local().block()),
+ nproc_ (src_.block().map().num_subblocks(0)),
+ rank_ (src_.block().map().impl_rank()),
+ req_ (new MPI_Request[nproc_])
+ {
+ using vsip::impl::Type_equal;
+
+ src_map_type src_map_ = src_.block().map();
+ dst_map_type dst_map_ = dst_.block().map();
+
+ // Check assumptions ----------------------------------------------
+
+ // This benchmarking code assumes a particular data layout,
+ // check that inputs match layout before proceeding.
+
+ // Source is row-major, block distributed by row.
+ assert((Type_equal<row2_type, src_order_type>::value));
+ assert(src_map_.num_subblocks(0) >= 1);
+ assert(src_map_.num_subblocks(1) == 1);
+ assert(src_map_.distribution(0) == block);
+ assert(src_.size(0) % src_map_.num_subblocks(0) == 0);
+
+ // Destination is row- or column-major, block distributed by column.
+ assert(dst_map_.num_subblocks(0) == 1);
+ assert(dst_map_.num_subblocks(1) >= 1);
+ assert(dst_map_.distribution(1) == block);
+ assert(dst_.size(1) % dst_map_.num_subblocks(1) == 0);
+
+ // Number of senders == number of receivers
+ assert(src_map_.num_subblocks(0) == dst_map_.num_subblocks(1));
+
+ // Create send-side datatype.
+ nrows_per_send_ = src_.size(0) / src_map_.num_subblocks(0);
+ ncols_per_recv_ = dst_.size(1) / dst_map_.num_subblocks(1);
+
+// printf("(%d): nrows_per_send = %d, ncols_per_recv = %d\n",
+// rank, nrows_per_send_, ncols_per_recv_);
+
+ // Setup source datatype. -----------------------------------------
+
+ assert(src_ext_.stride(1) == 1);
+
+ MPI_Type_vector(nrows_per_send_, // number of blocks
+ ncols_per_recv_, // elements per block
+ src_.size(1), // stride between block starts
+ MPI_FLOAT,
+ &tmp0_datatype_);
+ MPI_Type_commit(&tmp0_datatype_);
+
+ int lena[2] = { 1, 1 };
+ MPI_Aint loca[2] = { 0, ncols_per_recv_ * sizeof(T) };
+ MPI_Datatype typesa[2] = { tmp0_datatype_, MPI_UB };
+ MPI_Type_struct(2, lena, loca, typesa, &src_datatype_);
+ MPI_Type_commit(&src_datatype_);
+
+
+
+ // Setup destination datatype -------------------------------------
+
+ if (Type_equal<dst_order_type, col2_type>::value)
+ {
+ assert(dst_ext.stride(0) == 1);
+
+ MPI_Type_vector (ncols_per_recv_, // number of blocks
+ 1,
+ dst_.size(0),
+ MPI_FLOAT,
+ &tmp1_datatype_);
+ MPI_Type_commit(&tmp1_datatype_);
+
+ int len[3] = { 1, 1, 1 };
+ MPI_Aint loc[3] = { 0, 0, sizeof(T) };
+ MPI_Datatype types[3] = { tmp1_datatype_, MPI_LB, MPI_UB };
+ MPI_Type_struct(3, len, loc, types, &tmp2_datatype_);
+ MPI_Type_commit(&tmp2_datatype_);
+
+
+ MPI_Type_hvector(nrows_per_send_, 1, sizeof(float), tmp2_datatype_,
+ &dst_datatype_);
+ MPI_Type_commit(&dst_datatype_);
+ }
+ else
+ {
+ assert(dst_ext.stride(1) == 1);
+
+ MPI_Type_vector(nrows_per_send_, // number of blocks
+ ncols_per_recv_, // elements per block
+ ncols_per_recv_, // stride between block starts
+ MPI_FLOAT,
+ &dst_datatype_);
+ MPI_Type_commit(&dst_datatype_);
+ }
+
+ }
+
+ ~Send()
+ {
+ using vsip::impl::Type_equal;
+
+ MPI_Type_free(&tmp0_datatype_);
+ if (Type_equal<dst_order_type, col2_type>::value)
+ {
+ MPI_Type_free(&tmp1_datatype_);
+ MPI_Type_free(&tmp2_datatype_);
+ }
+ MPI_Type_free(&src_datatype_);
+ MPI_Type_free(&dst_datatype_);
+
+ delete[] req_;
+ }
+
+ void operator()()
+ {
+ // Send -----------------------------------------------------------
+ for (index_type i=0; i<nproc_; ++i)
+ {
+ if (i != rank_)
+ MPI_Isend(src_ext_.data() + i*ncols_per_recv_,
+ 1, src_datatype_,
+ i, 0, MPI_COMM_WORLD, &(req_[i]));
+ }
+
+ // Copy -----------------------------------------------------------
+ if (impl::Type_equal<dst_order_type, col2_type>::value)
+ {
+ T* dst = dst_ext_.data() + rank_*nrows_per_send_;
+ T* src = src_ext_.data() + rank_*ncols_per_recv_;
+ impl::transpose_unit(dst, src,
+ nrows_per_send_, ncols_per_recv_,
+ dst_ext_.stride(1),
+ src_ext_.stride(0));
+ }
+ else
+ {
+ T* dst = dst_ext_.data() + rank_*ncols_per_recv_*nrows_per_send_;
+ T* src = src_ext_.data() + rank_*ncols_per_recv_;
+ for (index_type i=0; i<nrows_per_send_; ++i)
+ memcpy(dst + i*ncols_per_recv_,
+ src + i*src_.size(1),
+ ncols_per_recv_ * sizeof(T));
+ }
+
+ // Recv -----------------------------------------------------------
+ if (impl::Type_equal<dst_order_type, col2_type>::value)
+ {
+ for (index_type i=0; i<nproc_; ++i)
+ {
+ MPI_Status status;
+ if (i != rank_)
+ int ierr = MPI_Recv(dst_ext_.data() + i*nrows_per_send_,
+ 1, dst_datatype_, i, 0,
+ MPI_COMM_WORLD, &status);
+ }
+ }
+ else
+ {
+ for (index_type i=0; i<nproc_; ++i)
+ {
+ MPI_Status status;
+ if (i != rank_)
+ int ierr = MPI_Recv(dst_ext_.data()
+ + i*nrows_per_send_*ncols_per_recv_,
+ 1, dst_datatype_, i, 0,
+ MPI_COMM_WORLD, &status);
+ }
+ }
+
+ // Wait for sends to finish ---------------------------------------
+ for (index_type i=0; i<nproc_; ++i)
+ {
+ MPI_Status status;
+ if (i != rank_)
+ MPI_Wait(&(req_[i]), &status);
+ }
+ }
+
+ // Member data.
+private:
+ Matrix<T, SBlock> src_;
+ Matrix<T, DBlock> dst_;
+
+ impl::Ext_data<dst_local_block_t> dst_ext_;
+ impl::Ext_data<src_local_block_t> src_ext_;
+
+ length_type nproc_;
+ index_type rank_;
+ length_type nrows_per_send_;
+ length_type ncols_per_recv_;
+
+ MPI_Request* req_;
+
+ MPI_Datatype src_datatype_;
+ MPI_Datatype dst_datatype_;
+ MPI_Datatype tmp0_datatype_;
+ MPI_Datatype tmp1_datatype_;
+ MPI_Datatype tmp2_datatype_;
+};
+
+
+
+/***********************************************************************
+ Send class using Persistent communications.
+***********************************************************************/
+
+template <typename T,
+ typename SBlock,
+ typename DBlock>
+class Send<T, SBlock, DBlock, Impl_persistent>
+{
+ typedef typename SBlock::map_type src_map_type;
+ typedef typename DBlock::map_type dst_map_type;
+
+ typedef typename impl::Block_layout<SBlock>::order_type src_order_type;
+ typedef typename impl::Block_layout<DBlock>::order_type dst_order_type;
+
+ typedef typename impl::Distributed_local_block<DBlock>::type
+ dst_local_block_t;
+ typedef typename impl::Distributed_local_block<SBlock>::type
+ src_local_block_t;
+
+ // Constructor.
+public:
+
+ Send(
+ Matrix<T, SBlock> src,
+ Matrix<T, DBlock> dst)
+ : src_ (src),
+ dst_ (dst),
+ src_ext_(src_.local().block()),
+ dst_ext_(dst_.local().block()),
+ nproc_ (src_.block().map().num_subblocks(0)),
+ req_ (new MPI_Request[2*nproc_]),
+ status_ (new MPI_Status[2*nproc_])
+ {
+ using vsip::impl::Type_equal;
+
+ src_map_type src_map_ = src_.block().map();
+ dst_map_type dst_map_ = dst_.block().map();
+
+ // Check assumptions ----------------------------------------------
+
+ // This benchmarking code assumes a particular data layout,
+ // check that inputs match layout before proceeding.
+
+ // Source is row-major, block distributed by row.
+ assert((Type_equal<row2_type, src_order_type>::value));
+ assert(src_map_.num_subblocks(0) >= 1);
+ assert(src_map_.num_subblocks(1) == 1);
+ assert(src_map_.distribution(0) == block);
+ assert(src_.size(0) % src_map_.num_subblocks(0) == 0);
+
+ // Destination is row- or column-major, block distributed by column.
+ assert(dst_map_.num_subblocks(0) == 1);
+ assert(dst_map_.num_subblocks(1) >= 1);
+ assert(dst_map_.distribution(1) == block);
+ assert(dst_.size(1) % dst_map_.num_subblocks(1) == 0);
+
+ // Number of senders == number of receivers
+ assert(src_map_.num_subblocks(0) == dst_map_.num_subblocks(1));
+
+ // Create send-side datatype.
+ nrows_per_send_ = src_.size(0) / src_map_.num_subblocks(0);
+ ncols_per_recv_ = dst_.size(1) / dst_map_.num_subblocks(1);
+
+// printf("(%d): nrows_per_send = %d, ncols_per_recv = %d\n",
+// rank, nrows_per_send_, ncols_per_recv_);
+
+ // Setup source datatype. -----------------------------------------
+
+ assert(src_ext_.stride(1) == 1);
+
+ MPI_Type_vector(nrows_per_send_, // number of blocks
+ ncols_per_recv_, // elements per block
+ src_.size(1), // stride between block starts
+ MPI_FLOAT,
+ &tmp0_datatype_);
+ MPI_Type_commit(&tmp0_datatype_);
+
+ int lena[2] = { 1, 1 };
+ MPI_Aint loca[2] = { 0, ncols_per_recv_ * sizeof(T) };
+ MPI_Datatype typesa[2] = { tmp0_datatype_, MPI_UB };
+ MPI_Type_struct(2, lena, loca, typesa, &src_datatype_);
+ MPI_Type_commit(&src_datatype_);
+
+ for (index_type i=0; i<nproc_; ++i)
+ {
+ MPI_Send_init(src_ext_.data() + i*ncols_per_recv_,
+ 1, src_datatype_,
+ i, 0, MPI_COMM_WORLD, &(req_[i]));
+ }
+
+
+
+ // Setup destination datatype -------------------------------------
+
+ if (Type_equal<dst_order_type, col2_type>::value)
+ {
+ assert(dst_ext.stride(0) == 1);
+
+ MPI_Type_vector (ncols_per_recv_, // number of blocks
+ 1,
+ dst_.size(0),
+ MPI_FLOAT,
+ &tmp1_datatype_);
+ MPI_Type_commit(&tmp1_datatype_);
+
+ int len[3] = { 1, 1, 1 };
+ MPI_Aint loc[3] = { 0, 0, sizeof(T) };
+ MPI_Datatype types[3] = { tmp1_datatype_, MPI_LB, MPI_UB };
+ MPI_Type_struct(3, len, loc, types, &tmp2_datatype_);
+ MPI_Type_commit(&tmp2_datatype_);
+
+
+ MPI_Type_hvector(nrows_per_send_, 1, sizeof(float), tmp2_datatype_,
+ &dst_datatype_);
+ MPI_Type_commit(&dst_datatype_);
+
+ for (index_type i=0; i<nproc_; ++i)
+ {
+ int ierr = MPI_Recv_init(dst_ext_.data() + i*nrows_per_send_,
+ 1, dst_datatype_, i, 0,
+ MPI_COMM_WORLD, &(req_[nproc_+i]));
+ }
+ }
+ else
+ {
+ assert(dst_ext.stride(1) == 1);
+
+ MPI_Type_vector(nrows_per_send_, // number of blocks
+ ncols_per_recv_, // elements per block
+ ncols_per_recv_, // stride between block starts
+ MPI_FLOAT,
+ &dst_datatype_);
+ MPI_Type_commit(&dst_datatype_);
+
+ for (index_type i=0; i<nproc_; ++i)
+ {
+ int ierr = MPI_Recv_init(dst_ext_.data()
+ + i*nrows_per_send_*ncols_per_recv_,
+ 1, dst_datatype_, i, 0,
+ MPI_COMM_WORLD, &(req_[nproc_+i]));
+ }
+ }
+
+ }
+
+ ~Send()
+ {
+ using vsip::impl::Type_equal;
+
+ MPI_Type_free(&tmp0_datatype_);
+ if (Type_equal<dst_order_type, col2_type>::value)
+ {
+ MPI_Type_free(&tmp1_datatype_);
+ MPI_Type_free(&tmp2_datatype_);
+ }
+ MPI_Type_free(&src_datatype_);
+ MPI_Type_free(&dst_datatype_);
+
+ for (index_type i=0; i<2*nproc_; ++i)
+ MPI_Request_free(&(req_[i]));
+
+ delete[] req_;
+ delete[] status_;
+ }
+
+ void operator()()
+ {
+ MPI_Startall(2*nproc_, req_);
+ MPI_Waitall (2*nproc_, req_, status_);
+ }
+
+ // Member data.
+private:
+ Matrix<T, SBlock> src_;
+ Matrix<T, DBlock> dst_;
+
+ impl::Ext_data<dst_local_block_t> dst_ext_;
+ impl::Ext_data<src_local_block_t> src_ext_;
+
+ length_type nproc_;
+ length_type nrows_per_send_;
+ length_type ncols_per_recv_;
+
+ MPI_Request* req_;
+ MPI_Status* status_;
+
+ MPI_Datatype src_datatype_;
+ MPI_Datatype dst_datatype_;
+ MPI_Datatype tmp0_datatype_;
+ MPI_Datatype tmp1_datatype_;
+ MPI_Datatype tmp2_datatype_;
+};
+
+
+
+/***********************************************************************
+ Send class using Persistent communications + local copy.
+***********************************************************************/
+
+template <typename T,
+ typename SBlock,
+ typename DBlock>
+class Send<T, SBlock, DBlock, Impl_persistent_x>
+{
+ typedef typename SBlock::map_type src_map_type;
+ typedef typename DBlock::map_type dst_map_type;
+
+ typedef typename impl::Block_layout<SBlock>::order_type src_order_type;
+ typedef typename impl::Block_layout<DBlock>::order_type dst_order_type;
+
+ typedef typename impl::Distributed_local_block<DBlock>::type
+ dst_local_block_t;
+ typedef typename impl::Distributed_local_block<SBlock>::type
+ src_local_block_t;
+
+ // Constructor.
+public:
+
+ Send(
+ Matrix<T, SBlock> src,
+ Matrix<T, DBlock> dst)
+ : src_ (src),
+ dst_ (dst),
+ src_ext_(src_.local().block()),
+ dst_ext_(dst_.local().block()),
+ nproc_ (src_.block().map().num_subblocks(0)),
+ rank_ (src_.block().map().impl_rank()),
+ req_ (new MPI_Request[2*nproc_]),
+ status_ (new MPI_Status[2*nproc_])
+ {
+ using vsip::impl::Type_equal;
+
+ src_map_type src_map_ = src_.block().map();
+ dst_map_type dst_map_ = dst_.block().map();
+
+ // Check assumptions ----------------------------------------------
+
+ // This benchmarking code assumes a particular data layout,
+ // check that inputs match layout before proceeding.
+
+ // Source is row-major, block distributed by row.
+ assert((Type_equal<row2_type, src_order_type>::value));
+ assert(src_map_.num_subblocks(0) >= 1);
+ assert(src_map_.num_subblocks(1) == 1);
+ assert(src_map_.distribution(0) == block);
+ assert(src_.size(0) % src_map_.num_subblocks(0) == 0);
+
+ // Destination is row- or column-major, block distributed by column.
+ assert(dst_map_.num_subblocks(0) == 1);
+ assert(dst_map_.num_subblocks(1) >= 1);
+ assert(dst_map_.distribution(1) == block);
+ assert(dst_.size(1) % dst_map_.num_subblocks(1) == 0);
+
+ // Number of senders == number of receivers
+ assert(src_map_.num_subblocks(0) == dst_map_.num_subblocks(1));
+
+ // Create send-side datatype.
+ nrows_per_send_ = src_.size(0) / src_map_.num_subblocks(0);
+ ncols_per_recv_ = dst_.size(1) / dst_map_.num_subblocks(1);
+
+// printf("(%d): nrows_per_send = %d, ncols_per_recv = %d\n",
+// rank, nrows_per_send_, ncols_per_recv_);
+
+ // Setup source datatype. -----------------------------------------
+
+ assert(src_ext_.stride(1) == 1);
+
+ MPI_Type_vector(nrows_per_send_, // number of blocks
+ ncols_per_recv_, // elements per block
+ src_.size(1), // stride between block starts
+ MPI_FLOAT,
+ &tmp0_datatype_);
+ MPI_Type_commit(&tmp0_datatype_);
+
+ int lena[2] = { 1, 1 };
+ MPI_Aint loca[2] = { 0, ncols_per_recv_ * sizeof(T) };
+ MPI_Datatype typesa[2] = { tmp0_datatype_, MPI_UB };
+ MPI_Type_struct(2, lena, loca, typesa, &src_datatype_);
+ MPI_Type_commit(&src_datatype_);
+
+ int n_req = 0;
+ for (index_type i=0; i<nproc_; ++i)
+ {
+ if (i != rank_)
+ MPI_Send_init(src_ext_.data() + i*ncols_per_recv_,
+ 1, src_datatype_,
+ i, 0, MPI_COMM_WORLD, &(req_[n_req++]));
+ }
+
+
+
+ // Setup destination datatype -------------------------------------
+
+ if (Type_equal<dst_order_type, col2_type>::value)
+ {
+ assert(dst_ext.stride(0) == 1);
+
+ MPI_Type_vector (ncols_per_recv_, // number of blocks
+ 1,
+ dst_.size(0),
+ MPI_FLOAT,
+ &tmp1_datatype_);
+ MPI_Type_commit(&tmp1_datatype_);
+
+ int len[3] = { 1, 1, 1 };
+ MPI_Aint loc[3] = { 0, 0, sizeof(T) };
+ MPI_Datatype types[3] = { tmp1_datatype_, MPI_LB, MPI_UB };
+ MPI_Type_struct(3, len, loc, types, &tmp2_datatype_);
+ MPI_Type_commit(&tmp2_datatype_);
+
+
+ MPI_Type_hvector(nrows_per_send_, 1, sizeof(float), tmp2_datatype_,
+ &dst_datatype_);
+ MPI_Type_commit(&dst_datatype_);
+
+ for (index_type i=0; i<nproc_; ++i)
+ {
+ if (i != rank_)
+ int ierr = MPI_Recv_init(dst_ext_.data() + i*nrows_per_send_,
+ 1, dst_datatype_, i, 0,
+ MPI_COMM_WORLD, &(req_[n_req++]));
+ }
+ }
+ else
+ {
+ assert(dst_ext.stride(1) == 1);
+
+ MPI_Type_vector(nrows_per_send_, // number of blocks
+ ncols_per_recv_, // elements per block
+ ncols_per_recv_, // stride between block starts
+ MPI_FLOAT,
+ &dst_datatype_);
+ MPI_Type_commit(&dst_datatype_);
+
+ for (index_type i=0; i<nproc_; ++i)
+ {
+ if (i != rank_)
+ int ierr = MPI_Recv_init(dst_ext_.data()
+ + i*nrows_per_send_*ncols_per_recv_,
+ 1, dst_datatype_, i, 0,
+ MPI_COMM_WORLD, &(req_[n_req++]));
+ }
+ }
+
+ }
+
+ ~Send()
+ {
+ using vsip::impl::Type_equal;
+
+ MPI_Type_free(&tmp0_datatype_);
+ if (Type_equal<dst_order_type, col2_type>::value)
+ {
+ MPI_Type_free(&tmp1_datatype_);
+ MPI_Type_free(&tmp2_datatype_);
+ }
+ MPI_Type_free(&src_datatype_);
+ MPI_Type_free(&dst_datatype_);
+
+ for (index_type i=0; i<2*(nproc_-1); ++i)
+ MPI_Request_free(&(req_[i]));
+
+ delete[] req_;
+ delete[] status_;
+ }
+
+ void operator()()
+ {
+ if (nproc_ > 1)
+ MPI_Startall(2*(nproc_-1), req_);
+ if (impl::Type_equal<dst_order_type, col2_type>::value)
+ {
+ T* dst = dst_ext_.data() + rank_*nrows_per_send_;
+ T* src = src_ext_.data() + rank_*ncols_per_recv_;
+ impl::transpose_unit(dst, src,
+ nrows_per_send_, ncols_per_recv_,
+ dst_ext_.stride(1),
+ src_ext_.stride(0));
+ }
+ else
+ {
+ T* dst = dst_ext_.data() + rank_*ncols_per_recv_*nrows_per_send_;
+ T* src = src_ext_.data() + rank_*ncols_per_recv_;
+ for (index_type i=0; i<nrows_per_send_; ++i)
+ memcpy(dst + i*ncols_per_recv_,
+ src + i*src_.size(1),
+ ncols_per_recv_ * sizeof(T));
+ }
+ if (nproc_ > 1)
+ MPI_Waitall (2*(nproc_-1), req_, status_);
+ }
+
+ // Member data.
+private:
+ Matrix<T, SBlock> src_;
+ Matrix<T, DBlock> dst_;
+
+ impl::Ext_data<dst_local_block_t> dst_ext_;
+ impl::Ext_data<src_local_block_t> src_ext_;
+
+ length_type nproc_;
+ index_type rank_;
+ length_type nrows_per_send_;
+ length_type ncols_per_recv_;
+
+ MPI_Request* req_;
+ MPI_Status* status_;
+
+ MPI_Datatype src_datatype_;
+ MPI_Datatype dst_datatype_;
+ MPI_Datatype tmp0_datatype_;
+ MPI_Datatype tmp1_datatype_;
+ MPI_Datatype tmp2_datatype_;
+};
+
+
+
+template <typename T,
+ typename SrcOrder,
+ typename DstOrder,
+ typename ImplTag>
+struct t_alltoall
+{
+ char* what() { return "t_alltoall"; }
+ int ops_per_point(length_type rows) { return rows*ratio_; }
+ int riob_per_point(length_type rows) { return rows*ratio_*sizeof(T); }
+ int wiob_per_point(length_type rows) { return rows*ratio_*sizeof(T); }
+ int mem_per_point(length_type rows) { return rows*ratio_*sizeof(T); }
+
+ void operator()(length_type size, length_type loop, float& time)
+ {
+ typedef Dense<2, T, SrcOrder, Map<> > src_block_t;
+ typedef Dense<2, T, DstOrder, Map<> > dst_block_t;
+
+ processor_type np = vsip::num_processors();
+ index_type rank = vsip::local_processor();
+
+ length_type nrows = size;
+ length_type ncols = ratio_*size;
+
+ Map<> root_map(1, 1);
+ Map<> src_map(np, 1);
+ Map<> dst_map(1, np);
+
+ Matrix<float, src_block_t> chk(nrows, ncols, root_map);
+ Matrix<float, src_block_t> src(nrows, ncols, src_map);
+ Matrix<float, dst_block_t> dst(nrows, ncols, dst_map);
+
+ Send<T, src_block_t, dst_block_t, ImplTag> send(src, dst);
+
+ // Initialize chk
+ if (root_map.subblock() != no_subblock)
+ {
+ for (index_type r=0; r<nrows; ++r)
+ for (index_type c=0; c<ncols; ++c)
+ chk.local()(r, c) = T(rank*ncols*nrows+r*ncols+c);
+ }
+
+ // Scatter chk to src
+ src = chk;
+
+ // Clear chk
+ chk = T();
+
+ // Assign src to dst
+ vsip::impl::profile::Timer t1;
+
+ t1.start();
+ for (index_type l=0; l<loop; ++l)
+ send();
+ t1.stop();
+
+ // Gather dst to chk
+ chk = dst;
+
+ // Check chk.
+ if (root_map.subblock() != no_subblock)
+ {
+ for (index_type r=0; r<nrows; ++r)
+ for (index_type c=0; c<ncols; ++c)
+ test_assert(chk.local().get(r, c) == T(rank*ncols*nrows+r*ncols+c));
+ }
+
+ time = t1.delta();
+ }
+
+ t_alltoall(length_type ratio) : ratio_(ratio) {};
+
+// Member data:
+ length_type ratio_;
+};
+
+
+
+void
+defaults(Loop1P& loop)
+{
+ loop.user_param_ = 1;
+ loop.stop_ = 11;
+}
+
+
+
+int
+test(Loop1P& loop, int what)
+{
+ // Ratio is the number of columns per row.
+ length_type ratio = loop.user_param_;
+
+ typedef row2_type Rt;
+ typedef col2_type Ct;
+
+ switch (what)
+ {
+ case 1: loop(t_alltoall<float, Rt, Rt, Impl_alltoall>(ratio)); break;
+ case 2: loop(t_alltoall<float, Rt, Ct, Impl_alltoall>(ratio)); break;
+
+ case 11: loop(t_alltoall<float, Rt, Rt, Impl_isend>(ratio)); break;
+ case 12: loop(t_alltoall<float, Rt, Ct, Impl_isend>(ratio)); break;
+
+ case 21: loop(t_alltoall<float, Rt, Rt, Impl_isend_x>(ratio)); break;
+ case 22: loop(t_alltoall<float, Rt, Ct, Impl_isend_x>(ratio)); break;
+
+ case 31: loop(t_alltoall<float, Rt, Rt, Impl_persistent>(ratio)); break;
+ case 32: loop(t_alltoall<float, Rt, Ct, Impl_persistent>(ratio)); break;
+
+ case 41: loop(t_alltoall<float, Rt, Rt, Impl_persistent_x>(ratio)); break;
+ case 42: loop(t_alltoall<float, Rt, Ct, Impl_persistent_x>(ratio)); break;
+
+ default:
+ return 0;
+ }
+ return 1;
+}
Index: benchmarks/qrd.cpp
===================================================================
RCS file: /home/cvs/Repository/vpp/benchmarks/qrd.cpp,v
retrieving revision 1.2
diff -u -r1.2 qrd.cpp
--- benchmarks/qrd.cpp 7 Sep 2005 12:19:30 -0000 1.2
+++ benchmarks/qrd.cpp 28 Feb 2006 22:17:51 -0000
@@ -72,6 +72,7 @@
int riob_per_point(length_type) { return -1*sizeof(T); }
int wiob_per_point(length_type) { return -1*sizeof(T); }
+ int mem_per_point(length_type size) { return size*sizeof(T); }
void operator()(length_type size, length_type loop, float& time)
{
Index: benchmarks/vmul.cpp
===================================================================
RCS file: /home/cvs/Repository/vpp/benchmarks/vmul.cpp,v
retrieving revision 1.6
diff -u -r1.6 vmul.cpp
--- benchmarks/vmul.cpp 27 Jan 2006 13:13:23 -0000 1.6
+++ benchmarks/vmul.cpp 28 Feb 2006 22:17:51 -0000
@@ -16,6 +16,7 @@
#include <vsip/initfin.hpp>
#include <vsip/support.hpp>
#include <vsip/math.hpp>
+#include <vsip/random.hpp>
#include <vsip/impl/profile.hpp>
#include "test.hpp"
@@ -26,6 +27,10 @@
+/***********************************************************************
+ Definitions - vector element-wise multiply
+***********************************************************************/
+
template <typename T>
struct t_vmul1
{
@@ -33,6 +38,7 @@
int ops_per_point(length_type) { return Ops_info<T>::mul; }
int riob_per_point(length_type) { return 2*sizeof(T); }
int wiob_per_point(length_type) { return 1*sizeof(T); }
+ int mem_per_point(length_type) { return 3*sizeof(T); }
void operator()(length_type size, length_type loop, float& time)
VSIP_IMPL_NOINLINE
@@ -41,6 +47,10 @@
Vector<T> B(size, T());
Vector<T> C(size);
+ Rand<T> gen(0, 0);
+ A = gen.randu(size);
+ B = gen.randu(size);
+
A(0) = T(3);
B(0) = T(4);
@@ -56,6 +66,92 @@
std::cout << "t_vmul1: ERROR" << std::endl;
abort();
}
+
+ for (index_type i=0; i<size; ++i)
+ test_assert(equal(C(i), A(i) * B(i)));
+
+ time = t1.delta();
+ }
+};
+
+
+
+template <typename T>
+struct t_vmul_ip1
+{
+ char* what() { return "t_vmul_ip1"; }
+ int ops_per_point(length_type) { return Ops_info<T>::mul; }
+ int riob_per_point(length_type) { return 2*sizeof(T); }
+ int wiob_per_point(length_type) { return 1*sizeof(T); }
+ int mem_per_point(length_type) { return 3*sizeof(T); }
+
+ void operator()(length_type size, length_type loop, float& time)
+ VSIP_IMPL_NOINLINE
+ {
+ Vector<T> A(size, T(1));
+ Vector<T> C(size);
+ Vector<T> chk(size);
+
+ Rand<T> gen(0, 0);
+ chk = gen.randu(size);
+ C = chk;
+
+ vsip::impl::profile::Timer t1;
+
+ t1.start();
+ for (index_type l=0; l<loop; ++l)
+ C *= A;
+ t1.stop();
+
+ for (index_type i=0; i<size; ++i)
+ test_assert(equal(chk(i), C(i)));
+
+ time = t1.delta();
+ }
+};
+
+
+
+template <typename T>
+struct t_vmul_dom1
+{
+ char* what() { return "t_vmul_dom1"; }
+ int ops_per_point(length_type) { return Ops_info<T>::mul; }
+ int riob_per_point(length_type) { return 2*sizeof(T); }
+ int wiob_per_point(length_type) { return 1*sizeof(T); }
+ int mem_per_point(length_type) { return 3*sizeof(T); }
+
+ void operator()(length_type size, length_type loop, float& time)
+ VSIP_IMPL_NOINLINE
+ {
+ Vector<T> A(size, T());
+ Vector<T> B(size, T());
+ Vector<T> C(size);
+
+ Rand<T> gen(0, 0);
+ A = gen.randu(size);
+ B = gen.randu(size);
+
+ A(0) = T(3);
+ B(0) = T(4);
+
+ Domain<1> dom(size);
+
+ vsip::impl::profile::Timer t1;
+
+ t1.start();
+ for (index_type l=0; l<loop; ++l)
+ C(dom) = A(dom) * B(dom);
+ t1.stop();
+
+ if (!equal(C(0), T(12)))
+ {
+ std::cout << "t_vmul1: ERROR" << std::endl;
+ abort();
+ }
+
+ for (index_type i=0; i<size; ++i)
+ test_assert(equal(C(i), A(i) * B(i)));
time = t1.delta();
}
@@ -70,6 +166,7 @@
int ops_per_point(length_type) { return Ops_info<T>::mul; }
int riob_per_point(length_type) { return 2*sizeof(T); }
int wiob_per_point(length_type) { return 1*sizeof(T); }
+ int mem_per_point(length_type) { return 3*sizeof(T); }
void operator()(length_type size, length_type loop, float& time)
VSIP_IMPL_NOINLINE
@@ -104,23 +201,74 @@
-// Benchmark scalar-view vector multiply (Scalar * View)
+/***********************************************************************
+ Definitions - real * complex vector element-wise multiply
+***********************************************************************/
template <typename T>
+struct t_rcvmul1
+{
+ char* what() { return "t_rcvmul1"; }
+ int ops_per_point(length_type) { return 2*Ops_info<T>::mul; }
+ int riob_per_point(length_type) { return sizeof(T) + sizeof(complex<T>); }
+ int wiob_per_point(length_type) { return 1*sizeof(complex<T>); }
+ int mem_per_point(length_type) { return 1*sizeof(T)+2*sizeof(complex<T>); }
+
+ void operator()(length_type size, length_type loop, float& time)
+ VSIP_IMPL_NOINLINE
+ {
+ Vector<complex<T> > A(size);
+ Vector<T> B(size);
+ Vector<complex<T> > C(size);
+
+ Rand<complex<T> > cgen(0, 0);
+ Rand<T> sgen(0, 0);
+
+ A = cgen.randu(size);
+ B = sgen.randu(size);
+
+ vsip::impl::profile::Timer t1;
+
+ t1.start();
+ for (index_type l=0; l<loop; ++l)
+ C = A * B;
+ t1.stop();
+
+ for (index_type i=0; i<size; ++i)
+ test_assert(equal(C(i), A(i) * B(i)));
+
+ time = t1.delta();
+ }
+};
+
+
+
+// Benchmark scalar-view vector multiply (Scalar * View)
+
+template <typename ScalarT,
+ typename T>
struct t_svmul1
{
char* what() { return "t_svmul1"; }
- int ops_per_point(length_type) { return Ops_info<T>::mul; }
+ int ops_per_point(length_type)
+ { if (sizeof(ScalarT) == sizeof(T))
+ return Ops_info<T>::mul;
+ else
+ return 2*Ops_info<ScalarT>::mul;
+ }
int riob_per_point(length_type) { return 1*sizeof(T); }
int wiob_per_point(length_type) { return 1*sizeof(T); }
+ int mem_per_point(length_type) { return 2*sizeof(T); }
void operator()(length_type size, length_type loop, float& time)
{
Vector<T> A(size, T());
Vector<T> C(size);
- T alpha = T(3);
+ ScalarT alpha = ScalarT(3);
+ Rand<T> gen(0, 0);
+ A = gen.randu(size);
A(0) = T(4);
vsip::impl::profile::Timer t1;
@@ -130,7 +278,8 @@
C = alpha * A;
t1.stop();
- test_assert(equal(C(0), T(12)));
+ for (index_type i=0; i<size; ++i)
+ test_assert(equal(C(i), alpha * A(i)));
time = t1.delta();
}
@@ -147,6 +296,7 @@
int ops_per_point(length_type) { return Ops_info<T>::mul; }
int riob_per_point(length_type) { return 1*sizeof(T); }
int wiob_per_point(length_type) { return 1*sizeof(T); }
+ int mem_per_point(length_type) { return 2*sizeof(T); }
void operator()(length_type size, length_type loop, float& time)
{
@@ -188,11 +338,21 @@
case 2: loop(t_vmul1<complex<float> >()); break;
case 3: loop(t_vmul2<complex<float>, impl::Cmplx_inter_fmt>()); break;
case 4: loop(t_vmul2<complex<float>, impl::Cmplx_split_fmt>()); break;
+ case 5: loop(t_rcvmul1<float>()); break;
+
+ case 11: loop(t_svmul1<float, float>()); break;
+ case 12: loop(t_svmul1<float, complex<float> >()); break;
+ case 13: loop(t_svmul1<complex<float>, complex<float> >()); break;
+
+ case 14: loop(t_svmul2<float>()); break;
+ case 15: loop(t_svmul2<complex<float> >()); break;
+
+ case 21: loop(t_vmul_dom1<float>()); break;
+ case 22: loop(t_vmul_dom1<complex<float> >()); break;
+
+ case 31: loop(t_vmul_ip1<float>()); break;
+ case 32: loop(t_vmul_ip1<complex<float> >()); break;
- case 11: loop(t_svmul1<float>()); break;
- case 12: loop(t_svmul1<complex<float> >()); break;
- case 13: loop(t_svmul2<float>()); break;
- case 14: loop(t_svmul2<complex<float> >()); break;
default:
return 0;
}
Index: benchmarks/vmul_c.cpp
===================================================================
RCS file: /home/cvs/Repository/vpp/benchmarks/vmul_c.cpp,v
retrieving revision 1.1
diff -u -r1.1 vmul_c.cpp
--- benchmarks/vmul_c.cpp 27 Jan 2006 13:13:23 -0000 1.1
+++ benchmarks/vmul_c.cpp 28 Feb 2006 22:17:51 -0000
@@ -21,13 +21,16 @@
#include "test.hpp"
#include "loop.hpp"
#include "ops_info.hpp"
-#define USE_SSE 1
-#include "sse.h"
using namespace vsip;
+/***********************************************************************
+ Definitions -- generic vector-multiply
+***********************************************************************/
+
+
template <typename T>
inline void
vmul(
@@ -64,6 +67,10 @@
+/***********************************************************************
+ t_vmul1 -- Benchmark view-view vector multiply
+***********************************************************************/
+
template <typename T>
struct t_vmul1
{
@@ -71,6 +78,7 @@
int ops_per_point(length_type) { return Ops_info<T>::mul; }
int riob_per_point(length_type) { return 2*sizeof(T); }
int wiob_per_point(length_type) { return 1*sizeof(T); }
+ int mem_per_point(length_type) { return 3*sizeof(T); }
void operator()(length_type size, length_type loop, float& time)
{
@@ -113,7 +121,9 @@
-// Benchmark scalar-view vector multiply (Scalar * View)
+/***********************************************************************
+ t_svmul1 -- Benchmark scalar-view vector multiply
+***********************************************************************/
template <typename T>
struct t_svmul1
@@ -122,6 +132,7 @@
int ops_per_point(length_type) { return Ops_info<T>::mul; }
int riob_per_point(length_type) { return 1*sizeof(T); }
int wiob_per_point(length_type) { return 1*sizeof(T); }
+ int mem_per_point(length_type) { return 3*sizeof(T); }
void operator()(length_type size, length_type loop, float& time)
{
Index: benchmarks/vmul_ipp.cpp
===================================================================
RCS file: /home/cvs/Repository/vpp/benchmarks/vmul_ipp.cpp,v
retrieving revision 1.2
diff -u -r1.2 vmul_ipp.cpp
--- benchmarks/vmul_ipp.cpp 19 Dec 2005 16:08:55 -0000 1.2
+++ benchmarks/vmul_ipp.cpp 28 Feb 2006 22:17:51 -0000
@@ -33,10 +33,12 @@
template <>
struct t_vmul_ipp<float>
{
+ typedef float T;
char* what() { return "t_vmul_ipp"; }
- int ops_per_point(size_t) { return Ops_info<float>::mul; }
- int riob_per_point(size_t) { return 2*sizeof(float); }
- int wiob_per_point(size_t) { return 1*sizeof(float); }
+ int ops_per_point(size_t) { return Ops_info<T>::mul; }
+ int riob_per_point(size_t) { return 2*sizeof(T); }
+ int wiob_per_point(size_t) { return 1*sizeof(T); }
+ int mem_per_point(size_t) { return 3*sizeof(T); }
void operator()(size_t size, size_t loop, float& time)
{
@@ -87,6 +89,7 @@
int ops_per_point(size_t) { return Ops_info<T>::mul; }
int riob_per_point(size_t) { return 2*sizeof(T); }
int wiob_per_point(size_t) { return 1*sizeof(T); }
+ int mem_per_point(size_t) { return 3*sizeof(T); }
void operator()(size_t size, size_t loop, float& time)
{
@@ -139,6 +142,7 @@
int ops_per_point(size_t) { return Ops_info<T>::mul; }
int riob_per_point(size_t) { return 2*sizeof(T); }
int wiob_per_point(size_t) { return 1*sizeof(T); }
+ int mem_per_point(size_t) { return 2*sizeof(T); }
void operator()(size_t size, size_t loop, float& time)
{
@@ -189,6 +193,7 @@
int ops_per_point(size_t) { return Ops_info<T>::mul; }
int riob_per_point(size_t) { return 2*sizeof(T); }
int wiob_per_point(size_t) { return 1*sizeof(T); }
+ int mem_per_point(size_t) { return 2*sizeof(T); }
void operator()(size_t size, size_t loop, float& time)
{
Index: doc/csl-docbook/GNUmakefile.inc
===================================================================
RCS file: /home/cvs/Repository/csl-docbook/GNUmakefile.inc,v
retrieving revision 1.11
diff -u -r1.11 GNUmakefile.inc
--- doc/csl-docbook/GNUmakefile.inc 24 Jan 2006 20:08:30 -0000 1.11
+++ doc/csl-docbook/GNUmakefile.inc 28 Feb 2006 22:17:53 -0000
@@ -93,9 +93,16 @@
install-html-$(notdir $(1)): $(1) install-htmldir
if test -r $(1); then \
$(INSTALL) -d $(DESTDIR)$(htmldir)/$(notdir $(1)); \
- $(INSTALL_DATA) $(1).html $(DESTDIR)$(htmldir)/; \
- $(INSTALL_DATA) $(1)/*.html $(DESTDIR)$(htmldir)/$(notdir $(1)); \
- $(INSTALL_DATA) $(1)/*.css $(DESTDIR)$(htmldir)/$(notdir $(1)); \
+ if test -r $(1).html; then \
+ $(INSTALL_DATA) $(1).html $(DESTDIR)$(htmldir)/; \
+ fi; \
+ for file in $(wildcard $(1)/*.html); do \
+ $(INSTALL_DATA) $$file $(DESTDIR)$(libdir); \
+ $(INSTALL_DATA) $$file $(DESTDIR)$(htmldir)/$(notdir $(1)); \
+ done; \
+ for file in $(wildcard $(1)/*.css); do \
+ $(INSTALL_DATA) $$file $(DESTDIR)$(htmldir)/$(notdir $(1)); \
+ done; \
fi
if test -d $(1)/images; then \
$(INSTALL) -d $(DESTDIR)$(htmldir)/$(notdir $(1))/images; \
Index: src/vsip/GNUmakefile.inc.in
===================================================================
RCS file: /home/cvs/Repository/vpp/src/vsip/GNUmakefile.inc.in,v
retrieving revision 1.13
diff -u -r1.13 GNUmakefile.inc.in
--- src/vsip/GNUmakefile.inc.in 4 Jan 2006 19:10:03 -0000 1.13
+++ src/vsip/GNUmakefile.inc.in 28 Feb 2006 22:17:54 -0000
@@ -22,6 +22,7 @@
ifdef VSIP_IMPL_HAVE_SAL
src_vsip_cxx_sources += $(srcdir)/src/vsip/impl/sal.cpp
endif
+src_vsip_cxx_sources += $(srcdir)/src/vsip/impl/simd/vmul.cpp
src_vsip_cxx_objects := $(patsubst $(srcdir)/%.cpp, %.$(OBJEXT), $(src_vsip_cxx_sources))
cxx_sources += $(src_vsip_cxx_sources)
@@ -46,6 +47,7 @@
$(INSTALL) -d $(DESTDIR)$(libdir)
$(INSTALL_DATA) src/vsip/libvsip.a $(DESTDIR)$(libdir)/libvsip$(suffix).a
$(INSTALL) -d $(DESTDIR)$(includedir)/vsip/impl
+ $(INSTALL) -d $(DESTDIR)$(includedir)/vsip/impl/simd
$(INSTALL_DATA) src/vsip/impl/acconfig.hpp $(DESTDIR)$(includedir)/vsip/impl
for header in $(hdr); do \
$(INSTALL_DATA) $(srcdir)/src/$$header \
Index: src/vsip/support.hpp
===================================================================
RCS file: /home/cvs/Repository/vpp/src/vsip/support.hpp,v
retrieving revision 1.26
diff -u -r1.26 support.hpp
--- src/vsip/support.hpp 27 Jan 2006 13:13:23 -0000 1.26
+++ src/vsip/support.hpp 28 Feb 2006 22:17:54 -0000
@@ -52,7 +52,9 @@
/// indication that they should not be inlined, then VSIP_IMPL_NOINLINE
/// is defined to that annotation, otherwise it is defined to
/// nothing.
-#if __GNUC__ >= 2 || defined(__ghs__)
+///
+/// Greenhills does not use this attribute.
+#if __GNUC__ >= 2
# define VSIP_IMPL_NOINLINE __attribute__ ((__noinline__))
#else
# define VSIP_IMPL_NOINLINE
Index: src/vsip/vector.hpp
===================================================================
RCS file: /home/cvs/Repository/vpp/src/vsip/vector.hpp,v
retrieving revision 1.36
diff -u -r1.36 vector.hpp
--- src/vsip/vector.hpp 11 Jan 2006 16:22:44 -0000 1.36
+++ src/vsip/vector.hpp 28 Feb 2006 22:17:54 -0000
@@ -30,6 +30,7 @@
#include <vsip/impl/par-chain-assign.hpp>
#include <vsip/impl/dispatch-assign.hpp>
#include <vsip/impl/lvalue-proxy.hpp>
+#include <vsip/math.hpp>
/***********************************************************************
Declarations
@@ -115,8 +116,11 @@
-#define VSIP_IMPL_ELEMENTWISE_SCALAR(op) \
- for (index_type i = 0; i < this->size(); i++) \
+#define VSIP_IMPL_ELEMENTWISE_SCALAR(op) \
+ *this = *this op val
+
+#define VSIP_IMPL_ELEMENTWISE_SCALAR_NOFWD(op) \
+ for (index_type i = 0; i < this->size(); i++) \
this->put(i, this->get(i) op val)
#define VSIP_IMPL_ELEMENTWISE_VECTOR(op) \
@@ -141,7 +145,7 @@
#define VSIP_IMPL_ASSIGN_OP_NOFWD(asop, op) \
template <typename T0> \
Vector& operator asop(T0 const& val) VSIP_NOTHROW \
- { VSIP_IMPL_ELEMENTWISE_SCALAR(op); return *this;} \
+ { VSIP_IMPL_ELEMENTWISE_SCALAR_NOFWD(op); return *this;} \
template <typename T0, typename Block0> \
Vector& operator asop(const_Vector<T0, Block0> v) VSIP_NOTHROW \
{ VSIP_IMPL_ELEMENTWISE_VECTOR_NOFWD(op); return *this;} \
Index: src/vsip/impl/distributed-block.hpp
===================================================================
RCS file: /home/cvs/Repository/vpp/src/vsip/impl/distributed-block.hpp,v
retrieving revision 1.17
diff -u -r1.17 distributed-block.hpp
--- src/vsip/impl/distributed-block.hpp 11 Jan 2006 16:22:44 -0000 1.17
+++ src/vsip/impl/distributed-block.hpp 28 Feb 2006 22:17:54 -0000
@@ -133,7 +133,7 @@
{
index_type sb = map_.impl_subblock_from_global_index(Index<1>(idx));
processor_type pr = *(map_.processor_begin(sb));
- value_type val;
+ value_type val = value_type(); // avoid -Wall 'may not be initialized'
if (pr == proc_)
{
@@ -152,7 +152,7 @@
index_type sb = map_.impl_subblock_from_global_index(
Index<2>(idx0, idx1));
processor_type pr = *(map_.processor_begin(sb));
- value_type val;
+ value_type val = value_type(); // avoid -Wall 'may not be initialized'
if (pr == proc_)
{
@@ -173,7 +173,7 @@
index_type sb = map_.impl_subblock_from_global_index(
Index<3>(idx0, idx1, idx2));
processor_type pr = *(map_.processor_begin(sb));
- value_type val;
+ value_type val = value_type(); // avoid -Wall 'may not be initialized'
if (pr == proc_)
{
Index: src/vsip/impl/eval-sal.hpp
===================================================================
RCS file: /home/cvs/Repository/vpp/src/vsip/impl/eval-sal.hpp,v
retrieving revision 1.1
diff -u -r1.1 eval-sal.hpp
--- src/vsip/impl/eval-sal.hpp 15 Nov 2005 21:01:27 -0000 1.1
+++ src/vsip/impl/eval-sal.hpp 28 Feb 2006 22:17:54 -0000
@@ -47,7 +47,8 @@
typename Block0,
typename Block1,
typename Block2>
-struct Evaluator<Op_prod_vv_outer, Block0, Op_list_3<T1, Block1, Block2>,
+struct Evaluator<Op_prod_vv_outer, Block0,
+ Op_list_3<T1, Block1 const&, Block2 const&>,
Mercury_sal_tag>
{
static bool const ct_valid =
@@ -59,7 +60,7 @@
Ext_data_cost<Block1>::value == 0 &&
Ext_data_cost<Block2>::value == 0;
- static bool rt_valid(Block0& r, T1 alpha, Block1 const& a, Block2 const& b)
+ static bool rt_valid(Block0& r, T1, Block1 const& a, Block2 const& b)
{
typedef typename Block_layout<Block1>::order_type order0_type;
dimension_type const r_dim1 = order0_type::impl_dim1;
@@ -125,7 +126,7 @@
typename Block1,
typename Block2>
struct Evaluator<Op_prod_vv_outer, Block0,
- Op_list_3<std::complex<T1>, Block1, Block2>,
+ Op_list_3<std::complex<T1>, Block1 const&, Block2 const&>,
Mercury_sal_tag>
{
typedef typename Block_layout<Block0>::complex_type complex0_type;
@@ -144,7 +145,7 @@
Type_equal<complex0_type, complex1_type>::value &&
Type_equal<complex1_type, complex2_type>::value;
- static bool rt_valid(Block0& r, std::complex<T1> alpha,
+ static bool rt_valid(Block0& r, std::complex<T1>,
Block1 const& a, Block2 const& b)
{
typedef typename Block_layout<Block1>::order_type order0_type;
@@ -576,7 +577,7 @@
Type_equal<complex0_type, complex1_type>::value &&
Type_equal<complex1_type, complex2_type>::value;
- static bool rt_valid(Block0& r, Block1 const& a, Block2 const& b)
+ static bool rt_valid(Block0&, Block1 const& a, Block2 const&)
{
Ext_data<Block1> ext_a(const_cast<Block1&>(a));
@@ -584,7 +585,8 @@
dimension_type const a_dim1 = order1_type::impl_dim1;
bool dense_major_dim =
- (ext_a.stride(a_dim0) == ext_a.size(a_dim1) * ext_a.stride(a_dim1));
+ (ext_a.stride(a_dim0) == (stride_type)ext_a.size(a_dim1) *
+ ext_a.stride(a_dim1));
return dense_major_dim;
}
@@ -652,7 +654,7 @@
Type_equal<complex0_type, complex1_type>::value &&
Type_equal<complex1_type, complex2_type>::value;
- static bool rt_valid(Block0& r, Block1 const& a, Block2 const& b)
+ static bool rt_valid(Block0&, Block1 const&, Block2 const& b)
{
Ext_data<Block2> ext_b(const_cast<Block2&>(b));
@@ -660,7 +662,8 @@
dimension_type const b_dim1 = order2_type::impl_dim1;
bool dense_major_dim =
- (ext_b.stride(b_dim0) == ext_b.size(b_dim1) * ext_b.stride(b_dim1));
+ (ext_b.stride(b_dim0) == (stride_type)ext_b.size(b_dim1) *
+ ext_b.stride(b_dim1));
return dense_major_dim;
}
@@ -746,9 +749,12 @@
dimension_type const b_dim1 = order2_type::impl_dim1;
bool dense_major_dim =
- (ext_r.stride(r_dim0) == ext_r.size(r_dim1) * ext_r.stride(r_dim1)) &&
- (ext_a.stride(a_dim0) == ext_a.size(a_dim1) * ext_a.stride(a_dim1)) &&
- (ext_b.stride(b_dim0) == ext_b.size(b_dim1) * ext_b.stride(b_dim1));
+ (ext_r.stride(r_dim0) == (stride_type)ext_r.size(r_dim1) *
+ ext_r.stride(r_dim1)) &&
+ (ext_a.stride(a_dim0) == (stride_type)ext_a.size(a_dim1) *
+ ext_a.stride(a_dim1)) &&
+ (ext_b.stride(b_dim0) == (stride_type)ext_b.size(b_dim1) *
+ ext_b.stride(b_dim1));
return dense_major_dim;
}
@@ -861,18 +867,18 @@
Type_equal<complex0_type, complex1_type>::value &&
Type_equal<complex1_type, complex2_type>::value;
- static bool rt_valid(Block0& r, T1 alpha, Block1 const& a,
- Block2 const& b, T2 beta)
+ static bool rt_valid(Block0& r, T1, Block1 const& a,
+ Block2 const& b, T2)
{
Ext_data<Block0> ext_r(const_cast<Block0&>(r));
Ext_data<Block1> ext_a(const_cast<Block1&>(a));
Ext_data<Block2> ext_b(const_cast<Block2&>(b));
- dimension_type const r_dim0 = order0_type::impl_dim0;
+ // dimension_type const r_dim0 = order0_type::impl_dim0;
dimension_type const r_dim1 = order0_type::impl_dim1;
- dimension_type const a_dim0 = order1_type::impl_dim0;
+ // dimension_type const a_dim0 = order1_type::impl_dim0;
dimension_type const a_dim1 = order1_type::impl_dim1;
- dimension_type const b_dim0 = order2_type::impl_dim0;
+ // dimension_type const b_dim0 = order2_type::impl_dim0;
dimension_type const b_dim1 = order2_type::impl_dim1;
bool unit_minor_stride =
Index: src/vsip/impl/expr_serial_dispatch.hpp
===================================================================
RCS file: /home/cvs/Repository/vpp/src/vsip/impl/expr_serial_dispatch.hpp,v
retrieving revision 1.3
diff -u -r1.3 expr_serial_dispatch.hpp
--- src/vsip/impl/expr_serial_dispatch.hpp 22 Dec 2005 01:29:25 -0000 1.3
+++ src/vsip/impl/expr_serial_dispatch.hpp 28 Feb 2006 22:17:54 -0000
@@ -23,6 +23,14 @@
#ifdef VSIP_IMPL_HAVE_SAL
#include <vsip/impl/sal.hpp>
#endif
+
+#ifdef VSIP_IMPL_HAVE_SIMD_GENERIC
+# include <vsip/impl/simd/eval-generic.hpp>
+#endif
+#ifdef VSIP_IMPL_HAVE_SIMD_3DNOWEXT
+# include <vsip/impl/simd/eval-simd-3dnowext.hpp>
+#endif
+
#include <iostream>
/***********************************************************************
@@ -35,7 +43,8 @@
{
/// The list of evaluators to be tried, in that specific order.
-typedef Make_type_list<Intel_ipp_tag,
+typedef Make_type_list<VSIP_IMPL_SIMD_TAG_LIST
+ Intel_ipp_tag,
Transpose_tag,
Mercury_sal_tag,
Loop_fusion_tag>::type LibraryTagList;
@@ -70,6 +79,7 @@
{
static void exec(DstBlock& dst, SrcBlock const& src)
+ VSIP_NOTHROW
{
if (EvalExpr::rt_valid(dst, src)) EvalExpr::exec(dst, src);
else Serial_dispatch_helper<Dim, DstBlock, SrcBlock, Rest>::exec(dst, src);
@@ -104,6 +114,7 @@
Tag, None_type, EvalExpr, true>
{
static void exec(DstBlock& dst, SrcBlock const& src)
+ VSIP_NOTHROW
{
if (EvalExpr::rt_valid(dst, src)) EvalExpr::exec(dst, src);
else assert(0);
Index: src/vsip/impl/expr_serial_evaluator.hpp
===================================================================
RCS file: /home/cvs/Repository/vpp/src/vsip/impl/expr_serial_evaluator.hpp,v
retrieving revision 1.4
diff -u -r1.4 expr_serial_evaluator.hpp
--- src/vsip/impl/expr_serial_evaluator.hpp 22 Dec 2005 01:29:25 -0000 1.4
+++ src/vsip/impl/expr_serial_evaluator.hpp 28 Feb 2006 22:17:54 -0000
@@ -32,6 +32,7 @@
struct Loop_fusion_tag;
struct Intel_ipp_tag;
struct Mercury_sal_tag;
+struct Simd_tag;
struct Transpose_tag;
/// Serial_expr_evaluator template.
@@ -55,7 +56,8 @@
static void exec(DstBlock& dst, SrcBlock const& src)
{
- for (index_type i=0; i<dst.size(1, 0); ++i)
+ length_type const size = dst.size(1, 0);
+ for (index_type i=0; i<size; ++i)
dst.put(i, src.get(i));
}
};
@@ -156,15 +158,19 @@
static void exec(DstBlock& dst, SrcBlock const& src, row2_type)
{
- for (index_type i=0; i<dst.size(2, 0); ++i)
- for (index_type j=0; j<dst.size(2, 1); ++j)
+ length_type const rows = dst.size(2, 0);
+ length_type const cols = dst.size(2, 1);
+ for (index_type i=0; i<rows; ++i)
+ for (index_type j=0; j<cols; ++j)
dst.put(i, j, src.get(i, j));
}
static void exec(DstBlock& dst, SrcBlock const& src, col2_type)
{
- for (index_type j=0; j<dst.size(2, 1); ++j)
- for (index_type i=0; i<dst.size(2, 0); ++i)
+ length_type const rows = dst.size(2, 0);
+ length_type const cols = dst.size(2, 1);
+ for (index_type j=0; j<cols; ++j)
+ for (index_type i=0; i<rows; ++i)
dst.put(i, j, src.get(i, j));
}
Index: src/vsip/impl/lvalue-proxy.hpp
===================================================================
RCS file: /home/cvs/Repository/vpp/src/vsip/impl/lvalue-proxy.hpp,v
retrieving revision 1.6
diff -u -r1.6 lvalue-proxy.hpp
--- src/vsip/impl/lvalue-proxy.hpp 3 Feb 2006 16:01:36 -0000 1.6
+++ src/vsip/impl/lvalue-proxy.hpp 28 Feb 2006 22:17:54 -0000
@@ -177,9 +177,8 @@
block_(b), coord_(i)
{};
- /// Read access, by implicit conversion to the value type.
- operator value_type() const
- { return lvalue_detail::get(block_, coord_); }
+ /// Since proxy derives from value type, implicit conversion
+ /// is not necessary.
/// Write access, by assignment from the value type.
Lvalue_proxy& operator= (value_type v)
Index: src/vsip/impl/par-chain-assign.hpp
===================================================================
RCS file: /home/cvs/Repository/vpp/src/vsip/impl/par-chain-assign.hpp,v
retri | |||||||