// flipcode - Unrolling Loops With Meta-Programming

//============================================================================]
// The two following classes each contain a templated static member function
// (doOperation) performing an assignment operation between two values (= or +=),
// so T_lhs and T_rhs can be float, const float, double, a Complex class or whatever.
//============================================================================]
// Operator policy: plain assignment.
// doOperation(lhs, rhs) executes `lhs = rhs`. Both operand types are deduced
// independently, so rhs may be const-qualified or of a different type from
// lhs, as long as the assignment itself is well-formed.
struct AssignOpAssign
{
  template <typename T_lhs, typename T_rhs>
  static inline void doOperation(T_lhs & lhs, T_rhs & rhs) { lhs = rhs; }
};

// Operator policy: compound addition.
// doOperation(lhs, rhs) executes `lhs += rhs`. Both operand types are deduced
// independently, so rhs may be const-qualified or of a different type from
// lhs, as long as `+=` is defined for the pair.
struct AssignOpAdd
{
  template <typename T_lhs, typename T_rhs>
  static inline void doOperation(T_lhs & lhs, T_rhs & rhs) { lhs += rhs; }
};

//============================================================================]
// The AssignOpLoopUnroller class contains the code performing the loop unrolling
// using the template recursion principle.
// This principle is simple: a template with an integer parameter contains a static
// function exec. Loop<N>::exec calls Loop<N-1>::exec.
// A specialisation for N == 0 is used to stop the recursion: Loop<0>::exec() does nothing.
//============================================================================]
// Compile-time unroller for element-wise "assignment-like" operations.
//
//   T_lhs      - element type of the destination array
//   T_rhs      - element type of the source array (may be const-qualified)
//   T_operator - policy type exposing a static doOperation(lhs, rhs)
//
// Usage: AssignOpLoopUnroller<L, R, Op>::Loop<N>::exec(dst, src);
// (inside a template where L/R/Op are dependent, write `::template Loop<N>`).
template <typename T_lhs, typename T_rhs, typename T_operator>
struct AssignOpLoopUnroller
{
  // Loop<N>::exec applies T_operator::doOperation to the element pairs at
  // indices N-1, N-2, ..., 0, in that order, by recursing on N.
  //
  // T_dummy exists only so that the N == 0 terminator below can be written
  // as a *partial* specialisation: the terminator cannot be spelled
  // portably as an explicit specialisation inside a class template.
  // Callers never need to supply it.
  template <int N, typename T_dummy = void>
  struct Loop
  {
    static inline void exec(T_lhs * aLArray, T_rhs * aRArray)
    {
      T_operator::doOperation(aLArray[N - 1], aRArray[N - 1]);
      Loop<N - 1, T_dummy>::exec(aLArray, aRArray);
    }
  };

  // Recursion terminator: nothing left to process.
  template <typename T_dummy>
  struct Loop<0, T_dummy>
  {
    static inline void exec(T_lhs *, T_rhs *) { }
  };
};

/**
 *  A toy vector class illustrating the use of this "unrolling metaprogram".
 *  Of course, other kinds of unrollers would have to be developed for other kinds
 *  of operations (like a ConstantAssignOpLoopUnroller).
 */
template <typename T, int N
struct Vector
{
  T       & operator [] (int i) { return m_aValues[i]; }

  const T & operator [] (int i) const { return m_aValues[i]; }

  Vector  & operator += (const Vector & rhs)
  {
    AssignOpLoopUnroller<T, const T, AssignOpAdd ::Loop<N::exec(m_aValues, rhs.m_aValues);
    return *this;
  }

  Vector  & operator = (const Vector & rhs)
  {
    AssignOpLoopUnroller<T, const T, AssignOpAssign ::Loop<N::exec(m_aValues, rhs.m_aValues);
    return *this;
  }

  const Vector operator + (const Vector & rhs) const { return Vector(*this) += rhs; }

protected:
  T   m_aValues[N];
};

//============================================================================]
// Some code using this class.
// Amazing, it works! :)
//============================================================================]

#include <iostream>

using namespace std;

// Four-component float vector used by the demo code below.
typedef Vector<float, 4> Vect4;

// Free-function wrapper around Vect4's += so that the code generated for
// it can be inspected in isolation (see the assembly listing further down).
void foo(Vect4 & lhs, Vect4 & rhs)
{
  lhs += rhs;
}

// Free-function wrapper around Vect4's = so that the code generated for
// it can be inspected in isolation (see the assembly listing further down).
void bar(Vect4 & lhs, Vect4 & rhs)
{
  lhs = rhs;
}

// Demo driver: fills two Vect4 values, runs foo (+=) then bar (=) on them
// and prints the contents of vect1.
// With the values below every component of vect2 becomes 42 after foo,
// and bar copies that into vect1, so the program prints
// "vect1 = 42 42 42 42".
int main()
{
  Vect4 vect1;
  Vect4 vect2;

  vect1[0] = 1;  vect1[1] = 7;  vect1[2] = 3;  vect1[3] = 11;
  vect2[0] = 41; vect2[1] = 35; vect2[2] = 39; vect2[3] = 31;
  foo(vect2, vect1);  // vect2 += vect1
  bar(vect1, vect2);  // vect1  = vect2

  cout << "vect1 =";
  for (int i = 0; i < 4; ++i)
    cout << " " << vect1[i];
  cout << endl;

  return 0;
}

//============================================================================]
// Samples from the assembly listing generated by MSVC++ :
//============================================================================]
/*

?foo@@YIXAAU?$Vector@M$03@@0@Z PROC NEAR		; foo, COMDAT

; 92   :   lhs += rhs;

	fld	DWORD PTR [edx+12]
	fadd	DWORD PTR [ecx+12]
	fstp	DWORD PTR [ecx+12]
	fld	DWORD PTR [edx+8]
	fadd	DWORD PTR [ecx+8]
	fstp	DWORD PTR [ecx+8]
	fld	DWORD PTR [edx+4]
	fadd	DWORD PTR [ecx+4]
	fstp	DWORD PTR [ecx+4]
	fld	DWORD PTR [ecx]
	fadd	DWORD PTR [edx]
	fstp	DWORD PTR [ecx]

; 93   : }

	ret	0
?foo@@YIXAAU?$Vector@M$03@@0@Z ENDP			; foo



 PUBLIC	?bar@@YIXAAU?$Vector@M$03@@0@Z			; bar
;	COMDAT ?bar@@YIXAAU?$Vector@M$03@@0@Z
_TEXT	SEGMENT
?bar@@YIXAAU?$Vector@M$03@@0@Z PROC NEAR		; bar, COMDAT

; 97   :   lhs = rhs;

	mov	eax, DWORD PTR [edx+12]
	mov	DWORD PTR [ecx+12], eax
	mov	eax, DWORD PTR [edx+8]
	mov	DWORD PTR [ecx+8], eax
	mov	eax, DWORD PTR [edx+4]
	mov	DWORD PTR [ecx+4], eax
	mov	edx, DWORD PTR [edx]
	mov	DWORD PTR [ecx], edx

; 98   : }

	ret	0
?bar@@YIXAAU?$Vector@M$03@@0@Z ENDP			; bar
*/

/*
 The assembly code shows that, as expected, the Vect4 operations are inlined and
 the loops are unrolled!
 The VC++ compiler's inlining depth will limit this unrolling. By default,
 inline_depth == 8.
 You can increase this value up to 255.
 */