//============================================================================]
// The two following classes each contain a static member function template
// (doOperation) performing an assignment operation between two values (= or +=).
// T_lhs and T_rhs can be float, const float, double, a Complex class or whatever.
//============================================================================]
// Policy type implementing plain assignment: doOperation(lhs, rhs) performs
// `lhs = rhs`.
struct AssignOpAssign
{
  // Assigns rhs to lhs through operator=.
  //   lhs  destination value, modified in place.
  //   rhs  source value, taken by lvalue reference; T_rhs may itself be
  //        const-qualified (e.g. const float), in which case rhs is read-only.
  template <typename T_lhs, typename T_rhs>
  static inline void doOperation(T_lhs & lhs, T_rhs & rhs) { lhs = rhs; }
};
  // Policy type implementing compound addition: doOperation(lhs, rhs) performs
  // `lhs += rhs`.
  struct AssignOpAdd
  {
    // Adds rhs onto lhs through operator+=.
    //   lhs  accumulator, modified in place.
    //   rhs  value to add, taken by lvalue reference; T_rhs may itself be
    //        const-qualified (e.g. const float), in which case rhs is read-only.
    template <typename T_lhs, typename T_rhs>
    static inline void doOperation(T_lhs & lhs, T_rhs & rhs) { lhs += rhs; }
  };
  //============================================================================]
// The AssignOpLoopUnroller class contains the code performing the loop unrolling
// using the template recursion principle.
// This principle is simple: a template with an integer parameter contains a static
// function exec. Loop<N>::exec calls Loop<N-1>::exec.
// A specialisation is used to stop the recursion: Loop<0>::exec() does nothing.
//============================================================================]
// Compile-time loop unroller: applies T_operator::doOperation pairwise to the
// elements of two arrays, with the iteration expressed as template recursion
// instead of a runtime loop.
//
//   T_lhs       element type of the left-hand (destination) array
//   T_rhs       element type of the right-hand (source) array; may be const
//   T_operator  policy type exposing a static doOperation(lhs, rhs) callable
//               with a T_lhs lvalue and a T_rhs lvalue
template <typename T_lhs, typename T_rhs, typename T_operator>
struct AssignOpLoopUnroller
{
  // Loop<N>::exec handles element N-1 and then delegates to Loop<N-1>, so the
  // elements are visited from index N-1 down to index 0. N must be >= 0.
  //
  // T_dummy is never supplied by callers (they write Loop<N>). It exists only
  // so that the N == 0 terminator below can be a *partial* specialisation:
  // an explicit specialisation written at class scope is rejected by some
  // compilers (GCC in particular), whereas a partial one is accepted everywhere.
  template <int N, typename T_dummy = void>
  struct Loop
  {
    static inline void exec(T_lhs * aLArray, T_rhs * aRArray)
    {
      T_operator::doOperation(aLArray[N - 1], aRArray[N - 1]);
      Loop<N - 1, T_dummy>::exec(aLArray, aRArray);
    }
  };

  // Recursion terminator: nothing left to process.
  template <typename T_dummy>
  struct Loop<0, T_dummy>
  {
    static inline void exec(T_lhs * /*aLArray*/, T_rhs * /*aRArray*/) { }
  };
};
  /**
 *  A toy vector class illustrating the use of this "unrolling metaprogram"
 *  Of course, other kinds of unrollers would have to be developed for other kinds
 *  of operations (like a ConstantAssignOpLoopUnroller).
 */
template <typename T, int N
struct Vector
{
  T       & operator [] (int i) { return m_aValues[i]; }
    const T & operator [] (int i) const { return m_aValues[i]; }
    Vector  & operator += (const Vector & rhs)
  {
    AssignOpLoopUnroller<T, const T, AssignOpAdd ::Loop<N::exec(m_aValues, rhs.m_aValues);
    return *this;
  }
    Vector  & operator = (const Vector & rhs)
  {
    AssignOpLoopUnroller<T, const T, AssignOpAssign ::Loop<N::exec(m_aValues, rhs.m_aValues);
    return *this;
  }
    const Vector operator + (const Vector & rhs) const { return Vector(*this) += rhs; }
  protected:
  T   m_aValues[N];
};
  //============================================================================]
// Some code using this class.
// Amazing, it works! :)
//============================================================================]
#include <iostream>
  using namespace std;
  // Four-component single-precision vector used by the demo code.
  typedef Vector<float, 4> Vect4;
  // Demo wrapper around Vect4::operator+= : accumulates rhs into lhs.
  // Kept as a free function so its generated code can be inspected in the
  // assembly listing.
  void foo(Vect4 & lhs, Vect4 & rhs)
  {
    lhs += rhs;
  }
  // Demo wrapper around Vect4::operator= : copies rhs into lhs.
  // Kept as a free function so its generated code can be inspected in the
  // assembly listing.
  void bar(Vect4 & lhs, Vect4 & rhs)
  {
    lhs = rhs;
  }
  // Demo driver: fills two vectors, adds the first onto the second with foo,
  // copies the result back into the first with bar, then prints the first.
  // Declared as `int main()` because standard C++ requires main to return int.
  int main()
  {
    Vect4 vect1;
    Vect4 vect2;

    vect1[0] = 1;  vect1[1] = 7;  vect1[2] = 3;  vect1[3] = 11;
    vect2[0] = 41; vect2[1] = 35; vect2[2] = 39; vect2[3] = 31;

    foo(vect2, vect1);   // vect2 += vect1
    bar(vect1, vect2);   // vect1  = vect2

    std::cout << "vect1 =";
    for (int i = 0; i < 4; ++i)
      std::cout << " " << vect1[i];
    std::cout << std::endl;

    return 0;
  }
  //============================================================================]
// Samples from the assembly listing generated by MSVC++ :
//============================================================================]
/*
  ?foo@@YIXAAU?$Vector@M$03@@0@Z PROC NEAR		; foo, COMDAT
  ; 92   :   lhs += rhs;
  	fld	DWORD PTR [edx+12]
	fadd	DWORD PTR [ecx+12]
	fstp	DWORD PTR [ecx+12]
	fld	DWORD PTR [edx+8]
	fadd	DWORD PTR [ecx+8]
	fstp	DWORD PTR [ecx+8]
	fld	DWORD PTR [edx+4]
	fadd	DWORD PTR [ecx+4]
	fstp	DWORD PTR [ecx+4]
	fld	DWORD PTR [ecx]
	fadd	DWORD PTR [edx]
	fstp	DWORD PTR [ecx]
  ; 93   : }
  	ret	0
?foo@@YIXAAU?$Vector@M$03@@0@Z ENDP			; foo
 
 
   PUBLIC	?bar@@YIXAAU?$Vector@M$03@@0@Z			; bar
;	COMDAT ?bar@@YIXAAU?$Vector@M$03@@0@Z
_TEXT	SEGMENT
?bar@@YIXAAU?$Vector@M$03@@0@Z PROC NEAR		; bar, COMDAT
  ; 97   :   lhs = rhs;
  	mov	eax, DWORD PTR [edx+12]
	mov	DWORD PTR [ecx+12], eax
	mov	eax, DWORD PTR [edx+8]
	mov	DWORD PTR [ecx+8], eax
	mov	eax, DWORD PTR [edx+4]
	mov	DWORD PTR [ecx+4], eax
	mov	edx, DWORD PTR [edx]
	mov	DWORD PTR [ecx], edx
  ; 98   : }
  	ret	0
?bar@@YIXAAU?$Vector@M$03@@0@Z ENDP			; bar
*/
  /*
 The assembly code shows that as expected, the Vect4 operations are inlined and
 the loops are unrolled!
 The VCPP compiler inlining depth will limit this unrolling. By default,
 inline_depth == 8
 You can increase this value up to 255.
 */