[
Date Prev][
Date Next][
Thread Prev][
Thread Next][
Date Index][
Thread Index]
Re: [vsipl++] [patch] Changes for GTRI build
- Subject: Re: [vsipl++] [patch] Changes for GTRI build
- From: Jules Bergmann <jules@xxxxxxxxxxxxxxxx>
- Date: Mon, 03 Apr 2006 16:37:01 -0400
Oops, forgot the patch. -- Jules
Jules Bergmann wrote:
I'm getting ready to build a snapshot on the GTRI cluster. To do this
cleanly, I need to incorporate the following fixes/feeatures:
- fixes GNUmakefile.in to install headers in src/vsip/impl/simd
- adds a '--version=VERSION' option to scripts. This is useful for
building snapshots when you can't fit everything in to one day!
While I'm at it, I figured I would include the following optimizations:
- optimizes matrix copy to use memcpy or an explicit loop when
possible.
- optimizes transpose to use SIMD instructions when possible.
-- Jules
--
Jules Bergmann
CodeSourcery
jules@xxxxxxxxxxxxxxxx
(650) 331-3385 x705
Index: ChangeLog
===================================================================
RCS file: /home/cvs/Repository/vpp/ChangeLog,v
retrieving revision 1.426
diff -u -r1.426 ChangeLog
--- ChangeLog 3 Apr 2006 19:24:28 -0000 1.426
+++ ChangeLog 3 Apr 2006 20:30:22 -0000
@@ -1,3 +1,13 @@
+2006-04-03 Jules Bergmann <jules@xxxxxxxxxxxxxxxx>
+
+ * GNUmakefile.in (hdr): Add SIMD headers in src/vsip/impl/simd.
+ * scripts/package (--version): New option that sets a specific
+ version (--snapshot is equiv to --version=YYYYMMDD).
+ * src/vsip/impl/expr_serial_evaluator.hpp: Optimize simple matrix
+ copy to use memcpy or explicit loop when possible.
+ * src/vsip/impl/fast-transpose.hpp: New transpose_unit implementation
+ that uses SIMD transpose kernels when possible.
+
2006-04-03 Don McCoy <don@xxxxxxxxxxxxxxxx>
* benchmarks/fastconv.cpp: Updated to use benchmarks.hpp. Separated
Index: GNUmakefile.in
===================================================================
RCS file: /home/cvs/Repository/vpp/GNUmakefile.in,v
retrieving revision 1.45
diff -u -r1.45 GNUmakefile.in
--- GNUmakefile.in 24 Jan 2006 17:33:15 -0000 1.45
+++ GNUmakefile.in 3 Apr 2006 20:30:22 -0000
@@ -181,6 +181,8 @@
$(wildcard $(srcdir)/src/vsip/*.hpp))
hdr += $(patsubst $(srcdir)/src/%, %, \
$(wildcard $(srcdir)/src/vsip/impl/*.hpp))
+hdr += $(patsubst $(srcdir)/src/%, %, \
+ $(wildcard $(srcdir)/src/vsip/impl/simd/*.hpp))
########################################################################
# Included Files
Index: src/vsip/impl/expr_serial_evaluator.hpp
===================================================================
RCS file: /home/cvs/Repository/vpp/src/vsip/impl/expr_serial_evaluator.hpp,v
retrieving revision 1.5
diff -u -r1.5 expr_serial_evaluator.hpp
--- src/vsip/impl/expr_serial_evaluator.hpp 3 Mar 2006 14:30:53 -0000 1.5
+++ src/vsip/impl/expr_serial_evaluator.hpp 3 Apr 2006 20:30:22 -0000
@@ -1,4 +1,4 @@
-/* Copyright (c) 2005 by CodeSourcery, LLC. All rights reserved. */
+/* Copyright (c) 2005, 2006 by CodeSourcery, LLC. All rights reserved. */
/** @file vsip/impl/expr_evaluator.hpp
@author Stefan Seefeld
@@ -68,6 +68,9 @@
typename SrcBlock>
struct Serial_expr_evaluator<2, DstBlock, SrcBlock, Transpose_tag>
{
+ typedef typename DstBlock::value_type dst_value_type;
+ typedef typename SrcBlock::value_type src_value_type;
+
static bool const is_rhs_expr = Is_expr_block<SrcBlock>::value;
static bool const is_rhs_simple =
@@ -90,12 +93,115 @@
static bool const ct_valid =
!is_rhs_expr &&
lhs_cost == 0 && rhs_cost == 0 &&
- !is_lhs_split && !is_rhs_split &&
- !Type_equal<src_order_type, dst_order_type>::value;
+ !is_lhs_split && !is_rhs_split;
static bool rt_valid(DstBlock& /*dst*/, SrcBlock const& /*src*/)
{ return true; }
+ static void exec(DstBlock& dst, SrcBlock const& src, row2_type, row2_type)
+ {
+ vsip::impl::Ext_data<DstBlock> d_ext(dst, vsip::impl::SYNC_OUT);
+ vsip::impl::Ext_data<SrcBlock> s_ext(src, vsip::impl::SYNC_IN);
+
+ dst_value_type* d_ptr = d_ext.data();
+ src_value_type* s_ptr = s_ext.data();
+
+ stride_type d_stride_0 = d_ext.stride(0);
+ stride_type d_stride_1 = d_ext.stride(1);
+ stride_type s_stride_0 = s_ext.stride(0);
+ stride_type s_stride_1 = s_ext.stride(1);
+
+ length_type size_0 = d_ext.size(0);
+ length_type size_1 = d_ext.size(1);
+
+ assert(size_0 <= s_ext.size(0));
+ assert(size_1 <= s_ext.size(1));
+
+ if (Type_equal<dst_value_type, src_value_type>::value
+ && d_stride_1 == 1 && s_stride_1 == 1)
+ {
+ if (d_stride_0 == size_1 && s_stride_0 == size_1)
+ memcpy(d_ptr, s_ptr, size_0*size_1*sizeof(dst_value_type));
+ else
+ for (index_type i=0; i<size_0; ++i)
+ {
+ memcpy(d_ptr, s_ptr, size_1*sizeof(dst_value_type));
+ d_ptr += d_stride_0;
+ s_ptr += s_stride_0;
+ }
+ }
+ else
+ {
+ for (index_type i=0; i<size_0; ++i)
+ {
+ dst_value_type* d_row = d_ptr;
+ src_value_type* s_row = s_ptr;
+
+ for (index_type j=0; j<size_1; ++j)
+ {
+ *d_row = *s_row;
+ d_row += d_stride_1;
+ s_row += s_stride_1;
+ }
+
+ d_ptr += d_stride_0;
+ s_ptr += s_stride_0;
+ }
+ }
+ }
+
+ static void exec(DstBlock& dst, SrcBlock const& src, col2_type, col2_type)
+ {
+ vsip::impl::Ext_data<DstBlock> d_ext(dst, vsip::impl::SYNC_OUT);
+ vsip::impl::Ext_data<SrcBlock> s_ext(src, vsip::impl::SYNC_IN);
+
+ dst_value_type* d_ptr = d_ext.data();
+ src_value_type* s_ptr = s_ext.data();
+
+ stride_type d_stride_0 = d_ext.stride(0);
+ stride_type d_stride_1 = d_ext.stride(1);
+ stride_type s_stride_0 = s_ext.stride(0);
+ stride_type s_stride_1 = s_ext.stride(1);
+
+ length_type size_0 = d_ext.size(0);
+ length_type size_1 = d_ext.size(1);
+
+ assert(size_0 <= s_ext.size(0));
+ assert(size_1 <= s_ext.size(1));
+
+ if (Type_equal<dst_value_type, src_value_type>::value
+ && d_stride_0 == 1 && s_stride_0 == 1)
+ {
+ if (d_stride_1 == size_0 && s_stride_1 == size_0)
+ memcpy(d_ptr, s_ptr, size_0*size_1*sizeof(dst_value_type));
+ else
+ for (index_type j=0; j<size_1; ++j)
+ {
+ memcpy(d_ptr, s_ptr, size_0*sizeof(dst_value_type));
+ d_ptr += d_stride_1;
+ s_ptr += s_stride_1;
+ }
+ }
+ else
+ {
+ for (index_type j=0; j<size_1; ++j)
+ {
+ dst_value_type* d_row = d_ptr;
+ src_value_type* s_row = s_ptr;
+
+ for (index_type i=0; i<size_0; ++i)
+ {
+ *d_row = *s_row;
+ d_row += d_stride_0;
+ s_row += s_stride_0;
+ }
+
+ d_ptr += d_stride_1;
+ s_ptr += s_stride_1;
+ }
+ }
+ }
+
static void exec(DstBlock& dst, SrcBlock const& src, col2_type, row2_type)
{
vsip::impl::Ext_data<DstBlock> dst_ext(dst, vsip::impl::SYNC_OUT);
Index: src/vsip/impl/fast-transpose.hpp
===================================================================
RCS file: /home/cvs/Repository/vpp/src/vsip/impl/fast-transpose.hpp,v
retrieving revision 1.1
diff -u -r1.1 fast-transpose.hpp
--- src/vsip/impl/fast-transpose.hpp 22 Dec 2005 01:29:25 -0000 1.1
+++ src/vsip/impl/fast-transpose.hpp 3 Apr 2006 20:30:22 -0000
@@ -1,4 +1,4 @@
-/* Copyright (c) 2005 by CodeSourcery, LLC. All rights reserved. */
+/* Copyright (c) 2005, 2006 by CodeSourcery, LLC. All rights reserved. */
/** @file vsip/impl/fast-transpose.hpp
@author Jules Bergmann
@@ -11,6 +11,20 @@
#define VSIP_IMPL_FAST_TRANSPOSE_HPP
/***********************************************************************
+ Includes & Macros
+***********************************************************************/
+
+#if __SSE2__ && __amd64__
+# define VSIP_IMPL_TRANSPOSE_USE_3DNOW 1
+# include <mmintrin.h>
+#elif __SSE2__
+# define VSIP_IMPL_TRANSPOSE_USE_SSE 1
+# include <xmmintrin.h>
+#endif
+
+
+
+/***********************************************************************
Definitions
***********************************************************************/
@@ -20,7 +34,489 @@
namespace impl
{
-// Transpose for unit-strides.
+namespace trans_detail
+{
+
+inline void
+transpose_simd_start()
+{
+#if VSIP_IMPL_TRANSPOSE_USE_3DNOW
+ __asm__ __volatile__ ("femms");
+#endif
+}
+
+inline void
+transpose_simd_stop()
+{
+#if VSIP_IMPL_TRANSPOSE_USE_3DNOW
+ __asm__ __volatile__ ("femms");
+#endif
+}
+
+
+
+/***********************************************************************
+ General Unrolled_transpose definitions
+***********************************************************************/
+
+template <typename T1,
+ typename T2,
+ unsigned Block,
+ bool SimdAligned>
+struct Unrolled_transpose
+{
+ static void exec(T1* dst, T2 const* src,
+ unsigned const dst_col_stride,
+ unsigned const src_row_stride)
+ {
+ typedef Unrolled_transpose<T1, T2, Block/2, SimdAligned> meta;
+
+ meta::exec(dst, src,
+ dst_col_stride,
+ src_row_stride);
+ meta::exec(dst + (Block/2)*dst_col_stride, src + (Block/2),
+ dst_col_stride,
+ src_row_stride);
+ meta::exec(dst + (Block/2), src + (Block/2)*src_row_stride,
+ dst_col_stride,
+ src_row_stride);
+ meta::exec(dst + (Block/2) + (Block/2)*dst_col_stride,
+ src + (Block/2)*src_row_stride + (Block/2),
+ dst_col_stride,
+ src_row_stride);
+ }
+};
+
+
+
+template <typename T1,
+ typename T2,
+ bool SimdAligned>
+struct Unrolled_transpose<T1, T2, 1, SimdAligned>
+{
+ static void exec(T1* dst, T2 const* src,
+ unsigned const dst_col_stride,
+ unsigned const src_row_stride)
+ {
+ *dst = *src;
+ }
+};
+
+
+
+/***********************************************************************
+ SSE specific Unrolled_transpose definitions
+***********************************************************************/
+
+#if VSIP_IMPL_TRANSPOSE_USE_SSE
+// 4x4 float unaligned fragment for SSE.
+
+template <>
+struct Unrolled_transpose<float, float, 4, false>
+{
+ static void exec(
+ float* dst,
+ float const* src,
+ unsigned const dst_col_stride,
+ unsigned const src_row_stride)
+ {
+ __v4sf row0 = _mm_loadu_ps(src + 0*src_row_stride + 0);
+ __v4sf row1 = _mm_loadu_ps(src + 1*src_row_stride + 0);
+ __v4sf row2 = _mm_loadu_ps(src + 2*src_row_stride + 0);
+ __v4sf row3 = _mm_loadu_ps(src + 3*src_row_stride + 0);
+ _MM_TRANSPOSE4_PS(row0, row1, row2, row3);
+ _mm_storeu_ps(dst + 0 + 0*dst_col_stride, row0);
+ _mm_storeu_ps(dst + 0 + 1*dst_col_stride, row1);
+ _mm_storeu_ps(dst + 0 + 2*dst_col_stride, row2);
+ _mm_storeu_ps(dst + 0 + 3*dst_col_stride, row3);
+ }
+};
+
+
+
+// 4x4 float aligned fragment for SSE.
+
+template <>
+struct Unrolled_transpose<float, float, 4, true>
+{
+ static void exec(
+ float* dst,
+ float const* src,
+ unsigned const dst_col_stride,
+ unsigned const src_row_stride)
+ {
+ __v4sf row0 = _mm_load_ps(src + 0*src_row_stride + 0);
+ __v4sf row1 = _mm_load_ps(src + 1*src_row_stride + 0);
+ __v4sf row2 = _mm_load_ps(src + 2*src_row_stride + 0);
+ __v4sf row3 = _mm_load_ps(src + 3*src_row_stride + 0);
+ _MM_TRANSPOSE4_PS(row0, row1, row2, row3);
+ _mm_store_ps(dst + 0 + 0*dst_col_stride, row0);
+ _mm_store_ps(dst + 0 + 1*dst_col_stride, row1);
+ _mm_store_ps(dst + 0 + 2*dst_col_stride, row2);
+ _mm_store_ps(dst + 0 + 3*dst_col_stride, row3);
+ }
+};
+
+
+
+// 2x2 complex<float> unaligned fragment for SSE.
+
+template <>
+struct Unrolled_transpose<complex<float>, complex<float>, 2, false>
+{
+ static void exec(
+ complex<float>* dst,
+ complex<float>const * src,
+ unsigned const dst_col_stride,
+ unsigned const src_row_stride)
+ {
+ __v4sf row0 = _mm_loadu_ps(
+ reinterpret_cast<float const*>(src + 0*src_row_stride + 0));
+ __v4sf row1 = _mm_loadu_ps(
+ reinterpret_cast<float const*>(src + 1*src_row_stride + 0));
+
+ __v4sf col0 = __builtin_ia32_shufps(row0, row1, 0x44); // 10 00 01 00
+ __v4sf col1 = __builtin_ia32_shufps(row0, row1, 0xEE); // 11 10 11 10
+
+ _mm_storeu_ps(reinterpret_cast<float*>(dst + 0 + 0*dst_col_stride), col0);
+ _mm_storeu_ps(reinterpret_cast<float*>(dst + 0 + 1*dst_col_stride), col1);
+ }
+};
+
+
+
+// 2x2 complex<float> aligned fragment for SSE.
+
+template <>
+struct Unrolled_transpose<complex<float>, complex<float>, 2, true>
+{
+ static void exec(
+ complex<float>* dst,
+ complex<float>const * src,
+ unsigned const dst_col_stride,
+ unsigned const src_row_stride)
+ {
+ __v4sf row0 = _mm_load_ps(
+ reinterpret_cast<float const*>(src + 0*src_row_stride + 0));
+ __v4sf row1 = _mm_load_ps(
+ reinterpret_cast<float const*>(src + 1*src_row_stride + 0));
+
+ __v4sf col0 = __builtin_ia32_shufps(row0, row1, 0x44); // 10 00 01 00
+ __v4sf col1 = __builtin_ia32_shufps(row0, row1, 0xEE); // 11 10 11 10
+
+ _mm_store_ps(reinterpret_cast<float*>(dst + 0 + 0*dst_col_stride), col0);
+ _mm_store_ps(reinterpret_cast<float*>(dst + 0 + 1*dst_col_stride), col1);
+ }
+};
+#endif
+
+
+
+/***********************************************************************
+ 3DNow! specific Unrolled_transpose definitions
+***********************************************************************/
+
+#if VSIP_IMPL_TRANSPOSE_USE_3DNOW
+
+typedef float __v2sf __attribute__ ((__mode__ (__V2SF__),__aligned__(8)));
+
+template <bool SimdAligned>
+struct Unrolled_transpose<float, float, 2, SimdAligned>
+{
+ static void exec(
+ float* dst,
+ float const* src,
+ unsigned const dst_col_stride,
+ unsigned const src_row_stride)
+ {
+ __v2sf row0 = *(__v2sf*)(src + 0*src_row_stride + 0);
+ __v2sf row1 = *(__v2sf*)(src + 1*src_row_stride + 0);
+
+ __v2sf col0 = (__v2sf)_m_punpckldq((__m64)row0, (__m64)row1);
+ __v2sf col1 = (__v2sf)_m_punpckhdq((__m64)row0, (__m64)row1);
+
+ *(__v2sf*)(dst + 0 + 0*dst_col_stride) = col0;
+ *(__v2sf*)(dst + 0 + 1*dst_col_stride) = col1;
+ }
+};
+
+template <bool SimdAligned>
+struct Unrolled_transpose<complex<float>, complex<float>, 1, SimdAligned>
+{
+ static void exec(
+ complex<float>* dst,
+ complex<float> const* src,
+ unsigned const dst_col_stride,
+ unsigned const src_row_stride)
+ {
+ __v2sf row0 = *(__v2sf*)(src);
+
+ *(__v2sf*)(dst) = row0;
+ }
+};
+#endif
+
+
+
+/***********************************************************************
+ Definitions - transpose_unit
+***********************************************************************/
+
+// transpose_unit implementation tags.
+
+struct Impl_loop {};
+template <unsigned Block, bool SimdAligned> struct Impl_block_iter {};
+template <unsigned Block, bool SimdAligned> struct Impl_block_recur {};
+template <unsigned Block, bool SimdAligned> struct Impl_block_recur_helper {};
+struct Impl_recur {};
+
+template <typename T1,
+ typename T2>
+void
+transpose_unit(
+ T1* dst,
+ T2 const* src,
+ unsigned const rows, // dst rows
+ unsigned const cols, // dst cols
+ unsigned const dst_col_stride,
+ unsigned const src_row_stride,
+ Impl_loop)
+{
+ for (unsigned r=0; r<rows; ++r)
+ for (unsigned c=0; c<cols; ++c)
+ dst[r+c*dst_col_stride] = src[r*src_row_stride+c];
+}
+
+
+
+#if VSIP_IMPL_TRANSPOSE_USE_3DNOW
+template <>
+inline void
+transpose_unit(
+ complex<float>* dst,
+ complex<float> const* src,
+ unsigned const rows, // dst rows
+ unsigned const cols, // dst cols
+ unsigned const dst_col_stride,
+ unsigned const src_row_stride,
+ Impl_loop)
+{
+ __asm__ __volatile__ ("femms");
+ for (unsigned r=0; r<rows; ++r)
+ for (unsigned c=0; c<cols; ++c)
+ {
+ __v2sf row0 = *(__v2sf*)(src + r*src_row_stride+c);
+ *(__v2sf*)(dst + r+c*dst_col_stride) = row0;
+ }
+ __asm__ __volatile__ ("femms");
+}
+#endif
+
+
+
+/// Blocked transpose, using iteration over blocks.
+
+template <typename T1,
+ typename T2,
+ unsigned Block,
+ bool SimdAligned>
+void
+transpose_unit(
+ T1* dst,
+ T2 const* src,
+ unsigned const rows, // dst rows
+ unsigned const cols, // dst cols
+ unsigned const dst_col_stride,
+ unsigned const src_row_stride,
+ Impl_block_iter<Block, SimdAligned>)
+{
+ unsigned full_cols = cols - cols%Block;
+ unsigned full_rows = rows - rows%Block;
+
+ // Transpose core of matrix using Unrolled_transpose a block
+ // at a time.
+
+ for (unsigned r=0; r<full_rows; r += Block)
+ for (unsigned c=0; c<full_cols; c += Block)
+ {
+ transpose_simd_start();
+ Unrolled_transpose<T1, T2, Block, SimdAligned>::exec(
+ dst + r + c*dst_col_stride,
+ src + r*src_row_stride + c,
+ dst_col_stride,
+ src_row_stride);
+ transpose_simd_stop();
+ }
+
+
+ // Cleanup edges of matrix using Impl_loop.
+
+ if (full_cols != cols)
+ {
+ unsigned extra_cols = cols - full_cols;
+ for (unsigned r=0; r<full_rows; r += Block)
+ transpose_unit(dst + r + full_cols*dst_col_stride,
+ src + r*src_row_stride + full_cols,
+ Block, extra_cols,
+ dst_col_stride,
+ src_row_stride,
+ Impl_loop());
+ }
+ if (full_rows != rows)
+ {
+ unsigned extra_rows = rows - full_rows;
+ for (unsigned c=0; c<full_cols; c += Block)
+ transpose_unit(dst + full_rows + c*dst_col_stride,
+ src + full_rows*src_row_stride + c,
+ extra_rows, Block,
+ dst_col_stride,
+ src_row_stride,
+ Impl_loop());
+ if (full_cols != cols)
+ {
+ transpose_unit(dst + full_rows + full_cols*dst_col_stride,
+ src + full_rows*src_row_stride + full_cols,
+ extra_rows, cols - full_cols,
+ dst_col_stride,
+ src_row_stride,
+ Impl_loop());
+ }
+ }
+}
+
+
+
+/// Recurcive blocked transpose helper function.
+
+/// This routine performs the recursive sub-division for Impl_block_recur.
+
+template <typename T1,
+ typename T2,
+ unsigned Block,
+ bool SimdAligned>
+void
+transpose_unit(
+ T1* dst,
+ T2 const* src,
+ unsigned const rows, // dst rows
+ unsigned const cols, // dst cols
+ unsigned const dst_col_stride,
+ unsigned const src_row_stride,
+ Impl_block_recur_helper<Block, SimdAligned>)
+{
+ unsigned const thresh = 4*Block;
+ if (rows <= thresh && cols <= thresh)
+ {
+ for (unsigned r=0; r<rows; r+=Block)
+ for (unsigned c=0; c<cols; c+=Block)
+ Unrolled_transpose<T1, T2, Block, SimdAligned>::exec(
+ dst + r + c*dst_col_stride,
+ src + r*src_row_stride + c,
+ dst_col_stride,
+ src_row_stride);
+ }
+ else if (cols >= rows)
+ {
+ unsigned cols1 = ((cols/Block)/2) * Block;
+ unsigned cols2 = cols - cols1;
+
+ transpose_unit(dst, src,
+ rows, cols1,
+ dst_col_stride, src_row_stride,
+ Impl_block_recur_helper<Block, SimdAligned>());
+
+ transpose_unit(dst + (cols1)*dst_col_stride, src + (cols1),
+ rows, cols2,
+ dst_col_stride, src_row_stride,
+ Impl_block_recur_helper<Block, SimdAligned>());
+ }
+ else
+ {
+ unsigned rows1 = ((rows/Block)/2) * Block;
+ unsigned rows2 = rows - rows1;
+
+ transpose_unit(dst, src,
+ rows1, cols,
+ dst_col_stride, src_row_stride,
+ Impl_block_recur_helper<Block, SimdAligned>());
+
+ transpose_unit(dst + (rows1), src + (rows1)*src_row_stride,
+ rows2, cols,
+ dst_col_stride, src_row_stride,
+ Impl_block_recur_helper<Block, SimdAligned>());
+ }
+}
+
+
+
+/// Blocked transpose, using recursion over blocks.
+
+template <typename T1,
+ typename T2,
+ unsigned Block,
+ bool SimdAligned>
+void
+transpose_unit(
+ T1* dst,
+ T2 const* src,
+ unsigned const rows, // dst rows
+ unsigned const cols, // dst cols
+ unsigned const dst_col_stride,
+ unsigned const src_row_stride,
+ Impl_block_recur<Block, SimdAligned>)
+{
+ unsigned full_cols = cols - cols%Block;
+ unsigned full_rows = rows - rows%Block;
+
+ // Transpose core of matrix using Unrolled_transpose a block
+ // at a time.
+
+ transpose_simd_start();
+ transpose_unit(dst, src, full_rows, full_cols,
+ dst_col_stride,
+ src_row_stride,
+ Impl_block_recur_helper<Block, SimdAligned>());
+ transpose_simd_stop();
+
+
+ // Cleanup edges of matrix using Impl_loop.
+
+ if (full_cols != cols)
+ {
+ unsigned extra_cols = cols - full_cols;
+ for (unsigned r=0; r<full_rows; r += Block)
+ transpose_unit(dst + r + full_cols*dst_col_stride,
+ src + r*src_row_stride + full_cols,
+ Block, extra_cols,
+ dst_col_stride,
+ src_row_stride,
+ Impl_loop());
+ }
+ if (full_rows != rows)
+ {
+ unsigned extra_rows = rows - full_rows;
+ for (unsigned c=0; c<full_cols; c += Block)
+ transpose_unit(dst + full_rows + c*dst_col_stride,
+ src + full_rows*src_row_stride + c,
+ extra_rows, Block,
+ dst_col_stride,
+ src_row_stride,
+ Impl_loop());
+ if (full_cols != cols)
+ {
+ transpose_unit(dst + full_rows + full_cols*dst_col_stride,
+ src + full_rows*src_row_stride + full_cols,
+ extra_rows, cols - full_cols,
+ dst_col_stride,
+ src_row_stride,
+ Impl_loop());
+ }
+ }
+}
+
+
+
+// Recurively decomposed transposition for unit-strides.
// Algorithm based on "Cache-Oblivious Algorithms (Extended Abstract)"
// by M. Frigo, C. Leiseron, H. Prokop, S. Ramachandran.
@@ -30,11 +526,13 @@
typename T2>
void
transpose_unit(
- T1* dst, T2* src,
+ T1* dst,
+ T2 const* src,
unsigned const rows, // dst rows
unsigned const cols, // dst cols
unsigned const dst_col_stride,
- unsigned const src_row_stride)
+ unsigned const src_row_stride,
+ Impl_recur)
{
unsigned const thresh = 16;
if (rows <= thresh && cols <= thresh)
@@ -65,6 +563,59 @@
}
}
+} // namespace vsip::impl::trans_detail
+
+
+// Unit-stride transpose dispatch function.
+
+template <typename T1,
+ typename T2>
+inline void
+transpose_unit(
+ T1* dst,
+ T2 const* src,
+ unsigned const rows, // dst rows
+ unsigned const cols, // dst cols
+ unsigned const dst_col_stride,
+ unsigned const src_row_stride)
+{
+ // Check if data is aligned for SSE ISA.
+ // - pointers need to be 16-byte aligned
+ // - rows/cols need to start with 16-byte alignment.
+ bool aligned =
+ (((unsigned long)dst ) & 0xf) == 0 &&
+ (((unsigned long)src ) & 0xf) == 0 &&
+ (((unsigned long)dst_col_stride*sizeof(T1)) & 0xf) == 0 &&
+ (((unsigned long)src_row_stride*sizeof(T1)) & 0xf) == 0;
+
+ if (rows < 16 || cols < 16)
+ {
+ if (aligned)
+ trans_detail::transpose_unit(dst, src, rows, cols,
+ dst_col_stride, src_row_stride,
+ trans_detail::Impl_block_recur<4, true>()
+ );
+ else
+ trans_detail::transpose_unit(dst, src, rows, cols,
+ dst_col_stride, src_row_stride,
+ trans_detail::Impl_block_recur<4, false>()
+ );
+ }
+ else
+ {
+ if (aligned)
+ trans_detail::transpose_unit(dst, src, rows, cols,
+ dst_col_stride, src_row_stride,
+ trans_detail::Impl_block_recur<16, true>()
+ );
+ else
+ trans_detail::transpose_unit(dst, src, rows, cols,
+ dst_col_stride, src_row_stride,
+ trans_detail::Impl_block_recur<16, false>()
+ );
+ }
+}
+
// Transpose for matrices with arbitrary strides.
@@ -77,7 +628,8 @@
typename T2>
void
transpose(
- T1* dst, T2* src,
+ T1* dst,
+ T2 const* src,
unsigned const rows, // dst rows
unsigned const cols, // dst cols
unsigned const dst_stride0,