/* +---------------------------------------------------------------------------+
   |          The Mobile Robot Programming Toolkit (MRPT) C++ library          |
   |                                                                           |
   |                   http://mrpt.sourceforge.net/                            |
   |                                                                           |
   |   Copyright (C) 2005-2009  University of Malaga                           |
   |                                                                           |
   |    This software was written by the Machine Perception and Intelligent    |
   |      Robotics Lab, University of Malaga (Spain).                          |
   |    Contact: Jose-Luis Blanco  <jlblanco@ctima.uma.es>                     |
   |                                                                           |
   |  This file is part of the MRPT project.                                   |
   |                                                                           |
   |     MRPT is free software: you can redistribute it and/or modify          |
   |     it under the terms of the GNU General Public License as published by  |
   |     the Free Software Foundation, either version 3 of the License, or     |
   |     (at your option) any later version.                                   |
   |                                                                           |
   |   MRPT is distributed in the hope that it will be useful,                 |
   |     but WITHOUT ANY WARRANTY; without even the implied warranty of        |
   |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         |
   |     GNU General Public License for more details.                          |
   |                                                                           |
   |     You should have received a copy of the GNU General Public License     |
   |     along with MRPT.  If not, see <http://www.gnu.org/licenses/>.         |
   |                                                                           |
   +---------------------------------------------------------------------------+ */
#ifndef CMatrixFixedNumeric_H
#define CMatrixFixedNumeric_H

#include <mrpt/math/CMatrix.h>
#include <mrpt/math/CMatrixD.h>

namespace mrpt
{
	namespace math
	{
		// NOTE(review): these using-directives live at namespace scope in a header, so they
		//  affect every file that includes it. Presumably they are what lets the class below
		//  name CPoint2D/CPoint3D/CPose2D/CPose3D without qualification - confirm before removing.
		using namespace mrpt::system;
		using namespace mrpt::poses;

		// Forward declarations ----------------
		// Free-function templates that the inline members of CMatrixFixedNumeric (declared below)
		//  forward to. Each plain function has a "_SIMD" sibling; the class picks one or the other
		//  with "#if MRPT_HAS_SSE2 && defined(MRPT_USE_SSE2)".
		// Note that the default value of "accumResultInOutput" is given only here: the definitions
		//  of multiply_HCHt / multiply_HtCH further down in this file do not repeat it.
		template <typename T, size_t N, size_t M> void  multiply_HCHt(const CMatrixFixedNumeric<T,N,M> &H,const CMatrixFixedNumeric<T,M,M> &C,CMatrixFixedNumeric<T,N,N>   &R,bool accumResultInOutput = false  );
		// Matrix inversion (the "_destroySrc" variant takes its source by non-const reference):
		template <typename T,size_t NROWS,size_t NCOLS> void  invMatrix( const CMatrixFixedNumeric<T,NROWS,NCOLS> &M, CMatrixFixedNumeric<T,NROWS,NCOLS> &out_inv );
		template <typename T,size_t NROWS,size_t NCOLS> void  invMatrix_destroySrc( CMatrixFixedNumeric<T,NROWS,NCOLS> &M, CMatrixFixedNumeric<T,NROWS,NCOLS> &out_inv );
		// Products: by a scalar, matrix x matrix, A*A^t and matrix x vector:
		template <typename T,size_t NROWS,size_t NCOLS> void  multiply(CMatrixFixedNumeric<T,NROWS,NCOLS>& m,const T val);
		template <typename T,size_t NROWS,size_t NCOLS,size_t M1C> void  multiply(const CMatrixFixedNumeric<T,NROWS,M1C>& m1,const CMatrixFixedNumeric<T,M1C,NCOLS>& m2,CMatrixFixedNumeric<T,NROWS,NCOLS>& RESULT );
		template <typename T,size_t NROWS,size_t NCOLS> void  multiply_SIMD(CMatrixFixedNumeric<T,NROWS,NCOLS>& m,const T val);
		template <typename T,size_t NROWS,size_t NCOLS,size_t M1C> void  multiply_SIMD(const CMatrixFixedNumeric<T,NROWS,M1C>& m1,const CMatrixFixedNumeric<T,M1C,NCOLS>& m2,CMatrixFixedNumeric<T,NROWS,NCOLS>& RESULT );
		template <typename T,size_t M1R,size_t M1C> void  multiply_AAt(const CMatrixFixedNumeric<T,M1R,M1C>& m1,CMatrixFixedNumeric<T,M1R,M1R>& RESULT );
		template <typename T,size_t N,size_t M> void  multiply_Ab( const CMatrixFixedNumeric<T,N,M>& A, const std::vector<T>& a, std::vector<T>& out_v );
		// In-place element-wise additions / subtractions:
		template <typename T,size_t NROWS,size_t NCOLS> void  sumInPlace(CMatrixFixedNumeric<T,NROWS,NCOLS>& m,const T val);
		template <typename T,size_t NROWS,size_t NCOLS> void  sumInPlace_SIMD(CMatrixFixedNumeric<T,NROWS,NCOLS>& m,const T val);
		template <typename T,size_t NROWS,size_t NCOLS> void  sumInPlace(CMatrixFixedNumeric<T,NROWS,NCOLS>& M,const CMatrixFixedNumeric<T,NROWS,NCOLS>& A);
		template <typename T,size_t NROWS,size_t NCOLS> void  sumInPlace_SIMD(CMatrixFixedNumeric<T,NROWS,NCOLS>& M,const CMatrixFixedNumeric<T,NROWS,NCOLS>& A);
		template <typename T,size_t NROWS,size_t NCOLS> void  substractInPlace(CMatrixFixedNumeric<T,NROWS,NCOLS>& M,const CMatrixFixedNumeric<T,NROWS,NCOLS>& A);
		template <typename T,size_t NROWS,size_t NCOLS> void  substractInPlace_SIMD(CMatrixFixedNumeric<T,NROWS,NCOLS>& M,const CMatrixFixedNumeric<T,NROWS,NCOLS>& A);
		// Reductions over all the elements (sum, minimum, maximum):
		template <typename T,size_t NROWS,size_t NCOLS> T  sumMatrixAllElements( const CMatrixFixedNumeric<T,NROWS,NCOLS>& M );
		template <typename T,size_t NROWS,size_t NCOLS> T  sumMatrixAllElements_SIMD( const CMatrixFixedNumeric<T,NROWS,NCOLS>& M );
		template <typename T,size_t NROWS,size_t NCOLS> T  minimumMatrix(const CMatrixFixedNumeric<T,NROWS,NCOLS>& M);
		template <typename T,size_t NROWS,size_t NCOLS> T  minimumMatrix_SIMD(const CMatrixFixedNumeric<T,NROWS,NCOLS>& M);
		template <typename T,size_t NROWS,size_t NCOLS> T  maximumMatrix(const CMatrixFixedNumeric<T,NROWS,NCOLS>& M);
		template <typename T,size_t NROWS,size_t NCOLS> T  maximumMatrix_SIMD(const CMatrixFixedNumeric<T,NROWS,NCOLS>& M);
		template <typename T,size_t NROWS,size_t NCOLS> void  minimumAndMaximumMatrix(const CMatrixFixedNumeric<T,NROWS,NCOLS>& M, T &val_min, T &val_max);
		template <typename T,size_t NROWS,size_t NCOLS> void  minimumAndMaximumMatrix_SIMD(const CMatrixFixedNumeric<T,NROWS,NCOLS>& M, T &val_min, T &val_max);
		// Determinant, element-wise square root and eigen-decomposition:
		template <typename T,size_t NROWS,size_t NCOLS> T  detMatrix(const CMatrixFixedNumeric<T,NROWS,NCOLS>& M);
		template <typename T,size_t NROWS,size_t NCOLS> void  sqrtMatrix(CMatrixFixedNumeric<T,NROWS,NCOLS>& M);
		template <typename T,size_t N> void  eigenVectorsMatrix(const CMatrixFixedNumeric<T,N,N> &M,CMatrixFixedNumeric<T,N,N> &Z,CMatrixFixedNumeric<T,N,N> &D );
		// Same as multiply_HCHt, but here H is MxN (so the NxN result corresponds to H^t * C * H):
		template <typename T, size_t N, size_t M> void  multiply_HtCH(const CMatrixFixedNumeric<T,M,N> &H,
			const CMatrixFixedNumeric<T,M,M> &C,CMatrixFixedNumeric<T,N,N>   &R,bool accumResultInOutput = false  );

		// End of forward declarations ----------------


		/**  A numeric matrix of compile-time fixed size.
		 *   The template can be instanced for data types "float" or "double"
		 *   Virtually all methods have specializations and/or SSE2 optimized implementations, so use this class when time is critical.
		 *
		 *   The NROWS x NCOLS elements are stored contiguously, by rows, in the public member m_Val.
		 *   Most arithmetic members are thin wrappers which forward to the free functions in mrpt::math
		 *   (or to their "_SIMD" versions when SSE2 is enabled, see the note below).
		 *
		 * \note To enable SSE2 optimizations, add the definition "#define MRPT_USE_SSE2" BEFORE including MRPT headers in your code. This is because these optimizations are only applicable to static matrix objects, but not when they are created in dynamic memory.
		 *
		 * \sa CMatrixTemplateNumeric (for dynamic-size matrices)
		 */
		template <typename T,size_t NROWS,size_t NCOLS>
		class CMatrixFixedNumeric    // Must have no "MRPTDLLIMPEXP"
		{
		public:
			typedef T value_type;	//!< The type of the matrix elements

			/** The stored data of the matrix: elements are saved by rows, left to right, from top to bottom. */
#if MRPT_HAS_SSE2 && defined(MRPT_USE_SSE2)
			MRPT_ALIGN16
#endif
			T	m_Val[ NROWS * NCOLS ];

		public:
			/** Default constructor, fills the whole matrix with zeros */
			CMatrixFixedNumeric() {
#if defined(_DEBUG) && MRPT_HAS_SSE2 && defined(MRPT_USE_SSE2)
				// The SIMD code paths need the buffer to start at a 16-byte boundary:
				if ((uintptr_t(m_Val) & 0x0f) != 0 )
					THROW_EXCEPTION("16-unaligned memory!")
#endif
				::memset(m_Val,0,sizeof(m_Val));
			}

			/** Constructor which leaves the matrix uninitialized: it uses two bool arguments with ignored values, but they must be present to make the method signature distinctive and make sure that the user wants the matrix to be uninitialized (ie, leaving only one bool argument may lead to unintended conversions from bool values!) */
			CMatrixFixedNumeric(bool ,bool ) {
			}

			/** Constructor from a C array, whose length must be exactly NROWS x NCOLS (checked at compile time).
			  *  The values are read by rows and converted to the element type of the matrix.
			  * \code
			  *  const double numbers[] = {
			  *    1,2,3,
			  *    4,5,6 };
			  *	 CMatrixFixedNumeric<double,2,3>    M(numbers);
			  * \endcode
			  */
			template <typename V, size_t N>
			CMatrixFixedNumeric ( V (&theArray)[N] )
			{
				MRPT_COMPILE_TIME_ASSERT(N!=0)
				MRPT_COMPILE_TIME_ASSERT(N==NROWS * NCOLS)
				// Always convert element by element. A raw memcpy() is only valid when V and T are
				//  the same type, and "sizeof(V)==sizeof(T)" does not guarantee that (e.g. int vs. float):
				//  in that case the bit patterns would be copied instead of the values.
				for (size_t i=0;i<N;i++)
					m_Val[i] = static_cast<T>(theArray[i]);
			}

			/** Copy constructor from another matrix of a different size: it's explicit so matrices of different sizes are not mixed by mistake.
			  *  Only the top-left block common to both matrices is copied; any remaining element of this matrix is set to zero.
			  */
			template <size_t N,size_t M>
			explicit CMatrixFixedNumeric(const CMatrixFixedNumeric<T,N,M> &B)
			{
				::memset(m_Val,0,sizeof(m_Val));
				const size_t nr = std::min(NROWS,N);
				const size_t nc = std::min(NCOLS,M);
				// Row strides differ (NCOLS here, M in the source), hence one memcpy per row:
				for (size_t r=0;r<nr;r++)
					::memcpy(m_Val+NCOLS*r, B.m_Val+M*r, sizeof(T)*nc );
			}

			/** Copy constructor from another matrix of a different type: it's explicit so matrices of different types are not mixed by mistake.
			  */
			template <typename R>
			explicit CMatrixFixedNumeric(const CMatrixFixedNumeric<R,NROWS,NCOLS> &B)
			{
				for (size_t r=0;r<NROWS;r++)
					for (size_t c=0;c<NCOLS;c++)
						get_unsafe(r,c) = static_cast<T>( B.get_unsafe(r,c) );
			}

			/** Copy constructor from a dynamic-size matrix
			  *  An exception will be raised if the sizes do not match, unless "clipToFixedMatrixSize" is true.
			  *  When clipping, only the common top-left block is copied and the rest of this matrix is set to zero.
			 */
			template <typename R>
			explicit CMatrixFixedNumeric(const CMatrixTemplate<R> &B, bool clipToFixedMatrixSize = false )
			{
				if (!clipToFixedMatrixSize) {
					// The assignment operators below perform the size checks:
					*this = B;
				}
				else
				{
					::memset(m_Val,0,sizeof(m_Val));
					const size_t nr = std::min(NROWS,B.getRowCount());
					const size_t nc = std::min(NCOLS,B.getColCount());
					for (size_t r=0;r<nr;r++)
						for (size_t c=0;c<nc;c++)
							get_unsafe(r,c) = B.get_unsafe(r,c);
				}
			}

			/** Conversion from a dynamic-size matrix to a fixed-size one.
			  * \exception std::exception On wrong sizes
			  */
			CMatrixFixedNumeric<T,NROWS,NCOLS>& operator =(const CMatrixTemplate<T> &B)
			{
				ASSERT_( NROWS==B.getRowCount() )
				ASSERT_( NCOLS==B.getColCount() )
				// Same element type: each row can be copied as a raw block.
				for (size_t r=0;r<NROWS;r++)
					::memcpy(m_Val+r*NCOLS, B.get_unsafe_row(r), sizeof(T)*NCOLS);
				return *this;
			}

			/** Conversion from a dynamic-size matrix of a different data type to a fixed-size one.
			  * \exception std::exception On wrong sizes
			  */
			template <typename R>
			CMatrixFixedNumeric<T,NROWS,NCOLS>& operator =(const CMatrixTemplate<R> &B)
			{
				ASSERT_( NROWS==B.getRowCount() )
				ASSERT_( NCOLS==B.getColCount() )
				for (size_t r=0;r<NROWS;r++)
					for (size_t c=0;c<NCOLS;c++)
						get_unsafe(r,c) = B.get_unsafe(r,c);
				return *this;
			}

			/** Get number of rows */
			static size_t getRowCount() {
				return NROWS;
			}
			/** Get number of columns */
			static size_t getColCount() {
				return NCOLS;
			}

			/** Builds the matrix from a 2D point (for 2x1 or 1x2 matrices); the work is done by matrixFromPoseOrPoint */
			CMatrixFixedNumeric(const CPoint2D &p) { matrixFromPoseOrPoint(*this,p); }
			/** Builds the matrix from a 3D point (for 3x1 or 1x3 matrices); the work is done by matrixFromPoseOrPoint */
			CMatrixFixedNumeric(const CPoint3D &p) { matrixFromPoseOrPoint(*this,p); }
			/** Builds the matrix from a 2D pose (for 3x1 or 1x3 matrices); the work is done by matrixFromPoseOrPoint */
			CMatrixFixedNumeric(const CPose2D &p) { matrixFromPoseOrPoint(*this,p); }
			/** Builds the matrix from a 3D pose (for 6x1 or 1x6 matrices); the work is done by matrixFromPoseOrPoint */
			CMatrixFixedNumeric(const CPose3D &p) { matrixFromPoseOrPoint(*this,p); }

			/** Assigns a 2D point to a 2x1 or 1x2 matrix */
			CMatrixFixedNumeric<T,NROWS,NCOLS> & operator = (const CPoint2D &p) {
				return matrixFromPoseOrPoint(*this,p);
			}
			/** Assigns a 3D point to a 3x1 or 1x3 matrix */
			CMatrixFixedNumeric<T,NROWS,NCOLS> & operator = (const CPoint3D &p) {
				return matrixFromPoseOrPoint(*this,p);
			}
			/** Assigns a 2D pose to a 3x1 or 1x3 matrix */
			CMatrixFixedNumeric<T,NROWS,NCOLS> & operator = (const CPose2D &p) {
				return matrixFromPoseOrPoint(*this,p);
			}
			/** Assigns a 3D pose to a 6x1 or 1x6 matrix */
			CMatrixFixedNumeric<T,NROWS,NCOLS> & operator = (const CPose3D &p) {
				return matrixFromPoseOrPoint(*this,p);
			}

			/** Make the matrix an identity matrix (for non-square matrices: ones in the main diagonal, zeros elsewhere) */
			void unit() {
				::memset(m_Val,0,sizeof(m_Val));
				// Element (i,i) lives at index i*NCOLS+i, since the row stride is NCOLS (not NROWS):
				const size_t nDiag = NROWS<NCOLS ? NROWS : NCOLS;
				for (size_t i=0;i<nDiag;i++)
					m_Val[i*(NCOLS+1)] = 1;
			}

			/** Set all elements to zero */
			void zeros() {
				::memset(m_Val,0,sizeof(m_Val));
			}

			/** Read-only access to one element (Use with caution, bounds are not checked!) */
			T get_unsafe(const size_t row, const size_t col) const {
				return m_Val[NCOLS*row+col];
			}

			/** Reference access to one element (Use with caution, bounds are not checked!) */
			T& get_unsafe(const size_t row, const size_t col) {
				return m_Val[NCOLS*row+col];
			}

			/** Sets an element  (Use with caution, bounds are not checked!) */
			void set_unsafe(const size_t row, const size_t col, const T val) {
				m_Val[NCOLS*row+col] = val;
			}

			/** Subscript operator to get/set individual elements.
				* Indexes are range-checked (throwing on error) only if _DEBUG is defined or MRPT_ALWAYS_CHECKS_DEBUG_MATRICES is non-zero.
				*/
			inline T& operator () (const size_t row, const size_t col)
			{
		#if defined(_DEBUG) || (MRPT_ALWAYS_CHECKS_DEBUG_MATRICES)
				if (row >= NROWS || col >= NCOLS)
					THROW_EXCEPTION( format("Indexes (%lu,%lu) out of range. Matrix is %lux%lu",static_cast<unsigned long>(row),static_cast<unsigned long>(col),static_cast<unsigned long>(NROWS),static_cast<unsigned long>(NCOLS)) );
		#endif
				return m_Val[NCOLS*row+col];
			}

			/** Subscript operator to read individual elements (const version).
				* Indexes are range-checked (throwing on error) only if _DEBUG is defined or MRPT_ALWAYS_CHECKS_DEBUG_MATRICES is non-zero.
				*/
			inline T operator () (const size_t row, const size_t col) const
			{
		#if defined(_DEBUG) || (MRPT_ALWAYS_CHECKS_DEBUG_MATRICES)
				if (row >= NROWS || col >= NCOLS)
					THROW_EXCEPTION( format("Indexes (%lu,%lu) out of range. Matrix is %lux%lu",static_cast<unsigned long>(row),static_cast<unsigned long>(col),static_cast<unsigned long>(NROWS),static_cast<unsigned long>(NCOLS)) );
		#endif
				return m_Val[NCOLS*row+col];
			}

			/** Gets the matrix as a string in matlab format, for example: [a11 a12 a13;a21 a22 a23]
			  *  Numbers are written in scientific notation, and every element (also the last one of each row) is followed by one space.
			  * \sa fromMatlabStringFormat
			  */
			std::string  inMatlabFormat() const
			{
				std::stringstream  s;
				s << "[";
				s << std::scientific;
				for (size_t i=0;i<NROWS;i++)
				{
					for (size_t j=0;j<NCOLS;j++)
						s << get_unsafe(i,j) << " ";
					// Row separator, except after the last row:
					if (i<NROWS-1)	s << ";";
				}
				s << "]";
				return s.str();
			}

			/** Read a matrix from a string in Matlab-like format, for example "[1 0 2; 0 4 -1]"
			  *  The string must start with '[' and end with ']'. Rows are separated by semicolons ';' and
			  *  columns in each row by one or more whitespaces ' ' or tabs.
			  *
			  * This format is also used for CConfigFile::read_matrix.
			  *
			  *  This template method can be instantiated for matrices of the types: int, long, unsigned int, unsigned long, float, double, long double
			  *
			  *  The string is first parsed into a temporary dynamic-size matrix; this matrix is only modified on success.
			  *
			  * \return true on success. false if the string is malformed, or it is of the wrong size.
			  * \sa inMatlabFormat, CConfigFile::read_matrix
			  */
			bool fromMatlabStringFormat(const std::string &s)
			{
				CMatrixTemplate<T>	M;
				if (!M.fromMatlabStringFormat(s)) return false;
				if (M.getColCount()!=NCOLS || M.getRowCount()!=NROWS) return false;
				*this = M;
				return true; // Ok
			}


			/** Returns the inverse of the matrix in "out_inv"
			  * \sa inv_fast
			  */
			void inv(CMatrixFixedNumeric<T,NROWS,NCOLS>& out_inv) const {
				mrpt::math::invMatrix(*this,out_inv);
			}

			/** Returns the inverse of the matrix in "out_inv" , DESTROYING the current matrix
			  * \sa inv
			  */
			void inv_fast(CMatrixFixedNumeric<T,NROWS,NCOLS>& out_inv) {
				mrpt::math::invMatrix_destroySrc(*this,out_inv);
			} // end of "inv_fast"

			/** this = A*B */
			template <size_t NC1>
			void multiply(const CMatrixFixedNumeric<T,NROWS,NC1> &A, const CMatrixFixedNumeric<T,NC1,NCOLS> &B  ) {
#if MRPT_HAS_SSE2 && defined(MRPT_USE_SSE2)
				mrpt::math::multiply_SIMD(A,B,*this);
#else
				mrpt::math::multiply(A,B,*this);
#endif
			}

			/** this = A * A^t */
			template <size_t NC1>
			void multiply_AAt(const CMatrixFixedNumeric<T,NROWS,NC1> &A) {
				mrpt::math::multiply_AAt(A,*this);
			}

			/** Computes the vector v = this * a, where "a" is a column vector of the appropriate length.
			  */
			void multiply_Ab( const std::vector<T>& a, std::vector<T>& out_v ) const {
				mrpt::math::multiply_Ab(*this,a,out_v);
			}

			/**	Calculate the operation this = A*B*C
			* \note This matrix is cleared before A, B and C are read, so it must not be passed as one of the operands.
			* \sa multiply_ABCt
			*/
			template <size_t N1,size_t N2>
			void multiply_ABC(
				const CMatrixFixedNumeric<T,NROWS,N1> &A,
				const CMatrixFixedNumeric<T,N1,N2>    &B,
				const CMatrixFixedNumeric<T,N2,NCOLS> &C)
			{
				this->zeros();
				for (size_t i=0;i<NROWS;i++)
					for (size_t l=0;l<N2;l++)
					{
						// Element (i,l) of A*B, computed once and reused for the whole i'th output row:
						T sumAccumInner = 0;
						for (size_t k=0;k<N1;k++)
							sumAccumInner += A.get_unsafe(i,k) * B.get_unsafe(k,l);
						for (size_t j=0;j<NCOLS;j++)
							get_unsafe(i,j) += sumAccumInner * C.get_unsafe(l,j);
					}
			}

			/**	Calculate the operation this = A*B*Ct
			* \note This matrix is cleared before A, B and C are read, so it must not be passed as one of the operands.
			* \sa multiply_ABC
			*/
			template <size_t N1,size_t N2>
			void multiply_ABCt(
				const CMatrixFixedNumeric<T,NROWS,N1> &A,
				const CMatrixFixedNumeric<T,N1,N2>    &B,
				const CMatrixFixedNumeric<T,NCOLS,N2> &C)
			{
				this->zeros();
				for (size_t i=0;i<NROWS;i++)
					for (size_t l=0;l<N2;l++)
					{
						// Element (i,l) of A*B, computed once and reused for the whole i'th output row:
						T sumAccumInner = 0;
						for (size_t k=0;k<N1;k++)
							sumAccumInner += A.get_unsafe(i,k) * B.get_unsafe(k,l);
						// C is read transposed: (j,l) instead of (l,j)
						for (size_t j=0;j<NCOLS;j++)
							get_unsafe(i,j) += sumAccumInner * C.get_unsafe(j,l);
					}
			}

			/** Sum to this matrix A and its transpose: this = this + A + At
			  *  A (M1 x M1) may be smaller than or as large as this matrix; it is added to the top-left M1 x M1 block.
			*/
			template <size_t M1>
			void add_AAt(const CMatrixFixedNumeric<T,M1,M1> &A)
			{
				// "<=" (not "<"): a matrix A of the same size as this one is a valid operand.
				ASSERT_(M1<=NROWS && M1<=NCOLS)
				// Off-diagonal elements: (i,j) and (j,i) both receive A(i,j)+A(j,i)
				for (size_t i=0;i<M1;i++)
					for (size_t j=i+1; j < M1; j++)
					{
						const T v = A.get_unsafe(i,j)+A.get_unsafe(j,i);
						get_unsafe(i,j) += v;
						get_unsafe(j,i) += v;
					}
				// Diagonal elements: A(i,i) + At(i,i) = 2*A(i,i)
				for (size_t i=0;i<M1;i++)
					get_unsafe(i,i) += 2*A.get_unsafe(i,i);
			}

			/** Multiplies all elements by a scalar */
			void operator *= (const T val) {
#if MRPT_HAS_SSE2 && defined(MRPT_USE_SSE2)
				mrpt::math::multiply_SIMD(*this,val);
#else
				mrpt::math::multiply(*this,val);
#endif
			}
			/** Divides all elements by a scalar (implemented as a multiplication by 1/val; val must not be zero) */
			void operator /= (const T val) {
				ASSERTMSG_(val!=0, "division by zero")
#if MRPT_HAS_SSE2 && defined(MRPT_USE_SSE2)
				mrpt::math::multiply_SIMD(*this,T(1)/val);
#else
				mrpt::math::multiply(*this,T(1)/val);
#endif
			}
			/** Sum a scalar to all elements */
			void operator += (const T val) {
#if MRPT_HAS_SSE2 && defined(MRPT_USE_SSE2)
				mrpt::math::sumInPlace_SIMD(*this,val);
#else
				mrpt::math::sumInPlace(*this,val);
#endif
			}
			/** Subtract a scalar from all elements */
			void operator -= (const T val) {
#if MRPT_HAS_SSE2 && defined(MRPT_USE_SSE2)
				mrpt::math::sumInPlace_SIMD(*this,-val);
#else
				mrpt::math::sumInPlace(*this,-val);
#endif
			}

			/** Sum a matrix to this one */
			void operator += (const CMatrixFixedNumeric<T,NROWS,NCOLS>& m) {
#if MRPT_HAS_SSE2 && defined(MRPT_USE_SSE2)
				mrpt::math::sumInPlace_SIMD(*this,m);
#else
				mrpt::math::sumInPlace(*this,m);
#endif
			}
			/** Subtract a matrix from this one */
			void operator -= (const CMatrixFixedNumeric<T,NROWS,NCOLS>& m) {
#if MRPT_HAS_SSE2 && defined(MRPT_USE_SSE2)
				mrpt::math::substractInPlace_SIMD(*this,m);
#else
				mrpt::math::substractInPlace(*this,m);
#endif
			}

			/** Returns the sum of all the elements
			  */
			T sumAll() const {
#if MRPT_HAS_SSE2 && defined(MRPT_USE_SSE2)
				return mrpt::math::sumMatrixAllElements_SIMD(*this);
#else
				return mrpt::math::sumMatrixAllElements(*this);
#endif
			}

			/** Returns the minimum of all the elements \sa minimumAndMaximum */
			T minimum() const {
#if MRPT_HAS_SSE2 && defined(MRPT_USE_SSE2)
				return mrpt::math::minimumMatrix_SIMD(*this);
#else
				return mrpt::math::minimumMatrix(*this);
#endif
			}

			/** Returns the maximum of all the elements  \sa minimumAndMaximum */
			T maximum() const {
#if MRPT_HAS_SSE2 && defined(MRPT_USE_SSE2)
				return mrpt::math::maximumMatrix_SIMD(*this);
#else
				return mrpt::math::maximumMatrix(*this);
#endif
			}

			/** Returns the minimum & maximum of all the elements */
			void minimumAndMaximum(T &val_min, T &val_max) const {
#if MRPT_HAS_SSE2 && defined(MRPT_USE_SSE2)
				mrpt::math::minimumAndMaximumMatrix_SIMD(*this,val_min,val_max);
#else
				mrpt::math::minimumAndMaximumMatrix(*this,val_min,val_max);
#endif
			}

			/** Returns the determinant of the matrix  */
			T det() const {
				return mrpt::math::detMatrix(*this);
			}

			/** Applies the sqrt to all the elements  */
			void Sqrt() {
				mrpt::math::sqrtMatrix(*this);
			}

			/** Computes the eigenvalues/eigenvector decomposition of a symmetric matrix.
			 *    The decomposition is: M = Z  D  Z<sup>T</sup>, where columns in Z are the
			 *	  eigenvectors and the diagonal matrix D contains the eigenvalues
			 *    as diagonal elements, sorted in <i>ascending</i> order.
			 *    The algorithm is taken from "Numerical recipes in C", freely available online.
			 */
			void eigenVectors( CMatrixFixedNumeric<T,NROWS,NROWS>& Z, CMatrixFixedNumeric<T,NROWS,NROWS>& D) const {
				mrpt::math::eigenVectorsMatrix(*this, Z,D);
			}

			/** Copy the upper half of the matrix into the lower half.
			  *  For non-square matrices only the largest top-left square block is processed.
			  */
			void  force_symmetry() {
				// Limit both indexes to the square block so that (j,i) is always a valid element:
				const size_t n = NROWS<NCOLS ? NROWS : NCOLS;
				for (size_t i=0;i<n;i++)
					for (size_t j=i+1;j<n;j++)
						get_unsafe(j,i) = get_unsafe(i,j);	// (i,j), with j>i, is in the upper half
			}

			/** @name Import/export as text
				@{ */

			/** Save matrix to a text file, compatible with MATLAB text format.
				* \param file The target filename.
				* \param fileFormat See TMatrixTextFileFormat. The format of the numbers in the text file.
				* \param appendMRPTHeader Insert this header to the file "% File generated by MRPT. Load with MATLAB with: VAR=load(FILENAME);"
				* \param userHeader Additional text to be written at the head of the file. Typically MATLAB comments "% This file blah blah". Final end-of-line is not needed.
				* \sa loadFromTextFile, CMatrixTemplate::inMatlabFormat, DEBUG_SAVE_MATRIX
				*/
			void  saveToTextFile(
				const std::string &file,
				TMatrixTextFileFormat fileFormat = MATRIX_FORMAT_ENG,
				bool    appendMRPTHeader = false,
				const std::string &userHeader = std::string("")
				) const
			{
				mrpt::math::saveMatrixToTextFile(*this, file,fileFormat,appendMRPTHeader,userHeader);
			}

			/** @} */

			/** This executes the operation \f$ \mathbf{R} = \mathbf{H} \mathbf{C} \mathbf{H}^t \f$, where 'this' matrix is \f$ \mathbf{H} \f$ and \f$ \mathbf{C} \f$ is symmetric, in an efficient and numerically stable way.
			  *  If 'this' matrix is \f$ N \times M \f$, then \f$ \mathbf{C} \f$ must be \f$ M \times M \f$, and the result matrix \f$ R \f$ will be \f$ N \times N \f$.
			  * The result from this method is assured to be symmetric (if \f$ \mathbf{C} \f$ is symmetric), whereas executing:
			  \code
				 R = H * C * (~H);
			  \endcode
			  * may lead to non-symmetric matrixes due to numerical rounding errors. In addition, this method is more efficient that the code above (see the MRPT's code examples on matrixes).
			  *
			  *  If accumResultInOutput=true, the contents of the output matrix will not be cleared, but added to the result of the operations. In this case it must have the correct size
			  *   before calling or an exception will be raised since this probably is a bug.
			  *
			  * \sa multiply_HCHt_scalar
			  */
			void  multiply_HCHt(
				const CMatrixFixedNumeric<T,NCOLS,NCOLS> &C,
				CMatrixFixedNumeric<T,NROWS,NROWS>   &R,
				bool                                 accumResultInOutput = false  ) const
			{
				mrpt::math::multiply_HCHt(*this,C,R,accumResultInOutput);
			}

			/** Like multiply_HCHt but for resulting matrices of size 1x1, which is returned as a scalar (this matrix must have one single row) */
			T multiply_HCHt_scalar( const CMatrixFixedNumeric<T,NCOLS,NCOLS> &C ) const
			{
				ASSERT_(NROWS==1)
				CMatrixFixedNumeric<T,NROWS,NROWS>   R;
				this->multiply_HCHt(C,R);
				return R.m_Val[0];
			}

			/** Like CMatrixFixedNumeric::multiply_HCHt but for Ht being given transpose. */
			void  multiply_HtCH(
				const CMatrixFixedNumeric<T,NROWS,NROWS> &C,
				CMatrixFixedNumeric<T,NCOLS,NCOLS>   &R,
				bool                                 accumResultInOutput = false  ) const
			{
				mrpt::math::multiply_HtCH(*this,C,R,accumResultInOutput);
			}

			/** Like multiply_HtCH but for resulting matrices of size 1x1, which is returned as a scalar (this matrix must have one single column) */
			T multiply_HtCH_scalar( const CMatrixFixedNumeric<T,NROWS,NROWS> &C ) const
			{
				ASSERT_(NCOLS==1)
				CMatrixFixedNumeric<T,NCOLS,NCOLS>   R;
				this->multiply_HtCH(C,R);
				return R.m_Val[0];
			}

			/** Used for "det": among the rows from "row" downwards, looks for the one with the largest absolute value
			  *  in column "row" and swaps it with row "row" if it is a different one.
			  * \return -1 if no non-zero element was found in that column, the index of the swapped row if a swap was done, or 0 if no swap was needed.
			  */
			int pivot(const size_t row)
			{
				size_t k = row;
				double amax,temp;

				amax = -1;
				for (size_t i=row; i < NROWS; i++)
					if ( (temp = fabs( get_unsafe(i,row))) > amax && temp != 0)
					{
						amax = temp;
						k = i;
					}
				// k is still "row" if all the candidates were zero:
				if (get_unsafe(k,row) == T(0))
					return -1;
				if (k != row)
				{
					// Swap rows "k" & "row":
					swap_rows(k,row);
					return static_cast<int>( k );
				}
				return 0;
			} // end pivot

			/** Exchanges the contents of rows i1 and i2 (indexes are not checked) */
			void swap_rows(size_t i1,size_t i2)
			{
				// Nothing to do; besides, memcpy() must not be called with overlapping buffers.
				if (i1==i2) return;
				T  tmprow[NCOLS];
				::memcpy(tmprow, m_Val+i1*NCOLS, sizeof(tmprow));
				::memcpy(m_Val+i1*NCOLS,m_Val+i2*NCOLS, sizeof(tmprow));
				::memcpy(m_Val+i2*NCOLS,tmprow, sizeof(tmprow));
			}

			/** auxiliary member to get element (i,j), starting at (1,1) instead of (0,0). Bounds are not checked. */
			T _E(const size_t row, const size_t col) const {
				return m_Val[NCOLS*(row-1)+col-1];
			}
			/** auxiliary member to get a reference to element (i,j), starting at (1,1) instead of (0,0). Bounds are not checked. */
			T & _E(const size_t row, const size_t col) {
				return m_Val[NCOLS*(row-1)+col-1];
			}

		}; // end of class definition ------------------------------


		/** Matrix-by-vector product: out_v = A * a.
		  *  A is NxM and "a" is read as a column vector with (at least) M elements; its length is not checked.
		  *  out_v is resized to N elements.
		  */
		template <typename T,size_t N,size_t M>
		void multiply_Ab( const CMatrixFixedNumeric<T,N,M>& A, const std::vector<T>& a, std::vector<T>& out_v )
		{
			out_v.resize(N);
			typename std::vector<T>::iterator dst = out_v.begin();
			for (size_t row=0; row < N; ++row, ++dst)
			{
				// Dot product of the row'th row of A with the input vector:
				typename std::vector<T>::const_iterator src = a.begin();
				T dot = 0;
				for (size_t col=0; col < M; ++col, ++src)
					dot += (*src) * A.get_unsafe(row,col);
				*dst = dot;
			}
		} // end of multiply_Ab

		/** Computes RESULT = m1 * m1^t.
		  *  Only the elements on and above the diagonal are evaluated; each one is then mirrored below the diagonal.
		  *  It is safe to pass the same object as m1 and RESULT: in that case the products are stored in a temporary buffer first.
		  */
		template <typename T,size_t M1R,size_t M1C>
		void multiply_AAt(
			const CMatrixFixedNumeric<T,M1R,M1C>& m1,
			CMatrixFixedNumeric<T,M1R,M1R>& RESULT )
		{
			const bool sameObject = ( (void*)&m1==(void*)&RESULT );

			if (!sameObject)
			{
				// Input and output are different objects: write each product straight into RESULT.
				for (size_t r=0; r < M1R; r++)
					for (size_t c=r; c < M1R; c++)
					{
						T dot = 0;
						for (size_t k=0; k < M1C; k++)
							dot += m1.get_unsafe(r,k) * m1.get_unsafe(c,k);
						RESULT.get_unsafe(c,r) = dot;
						RESULT.get_unsafe(r,c) = dot;
					}
			}
			else
			{
				// The input would be overwritten while still being read, so gather all
				//  the upper-triangle products first...
				T  upper[M1R*M1R];
				size_t n = 0;
				for (size_t r=0; r < M1R; r++)
					for (size_t c=r; c < M1R; c++)
					{
						T dot = 0;
						for (size_t k=0; k < M1C; k++)
							dot += m1.get_unsafe(r,k) * m1.get_unsafe(c,k);
						upper[n++] = dot;
					}

				// ...and then unpack them (in the same order) into both triangles:
				n = 0;
				for (size_t r=0; r < M1R; r++)
					for (size_t c=r; c < M1R; c++)
					{
						const T v = upper[n++];
						RESULT.get_unsafe(c,r) = v;
						RESULT.get_unsafe(r,c) = v;
					}
			}
		} // end multiply_AAt


		/** Multiply 2 matrices: RESULT = m1 * m2
		  */
		template <typename T,size_t NROWS,size_t NCOLS,size_t M1C>
		void multiply(
			const CMatrixFixedNumeric<T,NROWS,M1C>& m1,
			const CMatrixFixedNumeric<T,M1C,NCOLS>& m2,
			CMatrixFixedNumeric<T,NROWS,NCOLS>& RESULT )
		{
			MRPT_TRY_START

			// If one of the matrices is me, make a copy:
			if ( (CMatrixFixedNumeric<T,NROWS,NCOLS>*)(&m1)==&RESULT || (CMatrixFixedNumeric<T,NROWS,NCOLS>*)(&m2)==&RESULT)
			{
				// Save result in a temporary matrix:
				T  temp[NROWS*NCOLS];
				size_t out_idx = 0;
				for (size_t i=0; i < NROWS; i++)
				{
					for (size_t j=0; j < NCOLS; j++)
					{
						T accum = 0;
						for (size_t k=0; k < M1C; k++)
							accum += m1.get_unsafe(i,k) * m2.get_unsafe(k,j);
						temp[out_idx++] = accum;
					}
				}

				// Copy from temp:
				::memcpy(RESULT.m_Val,temp,sizeof(RESULT.m_Val));
			}
			else
			{
				// Work directly over the data:
				for (size_t i=0; i < NROWS; i++)
				{
					for (size_t j=0; j < NCOLS; j++)
					{
						T accum = 0;
						for (size_t k=0; k < M1C; k++)
							accum += m1.get_unsafe(i,k) * m2.get_unsafe(k,j);
						RESULT.get_unsafe(i,j)=accum;
					}
				}
			}

			MRPT_TRY_END
		} // end multiply

		/** In-place product by a scalar: every element of "m" is multiplied by "val" */
		template <typename T,size_t NROWS,size_t NCOLS>
		void multiply(
			CMatrixFixedNumeric<T,NROWS,NCOLS>& m,
			const T val)
		{
			T * const last = m.m_Val + NROWS*NCOLS;
			for (T *it = m.m_Val; it != last; ++it)
				(*it) *= val;
		}

		/** In-place addition of a scalar: "val" is added to every element of "m" */
		template <typename T,size_t NROWS,size_t NCOLS>
		void sumInPlace(
			CMatrixFixedNumeric<T,NROWS,NCOLS>& m,
			const T val)
		{
			T * const last = m.m_Val + NROWS*NCOLS;
			for (T *it = m.m_Val; it != last; ++it)
				(*it) += val;
		}

		/** Element-wise in-place addition of two matrices of the same size: M += A */
		template <typename T,size_t NROWS,size_t NCOLS>
		void sumInPlace(
			CMatrixFixedNumeric<T,NROWS,NCOLS>& M,
			const CMatrixFixedNumeric<T,NROWS,NCOLS>& A)
		{
			const size_t count = NROWS*NCOLS;
			for (size_t idx=0; idx<count; ++idx)
				M.m_Val[idx] = M.m_Val[idx] + A.m_Val[idx];
		}

		/** Returns the sum of all the elements of the matrix, accumulated in storage (row-major) order */
		template <typename T,size_t NROWS,size_t NCOLS>
		T sumMatrixAllElements( const CMatrixFixedNumeric<T,NROWS,NCOLS>& M ) {
			T  total = 0;
			const T * const last = M.m_Val + NROWS*NCOLS;
			for (const T *it = M.m_Val; it != last; ++it)
				total += *it;
			return total;
		}

		/** Returns the smallest element of the matrix.
		  *  The search starts from std::numeric_limits<T>::max(), so that is the largest value that can be returned.
		  */
		template <typename T,size_t NROWS,size_t NCOLS>
		T minimumMatrix(const CMatrixFixedNumeric<T,NROWS,NCOLS>& M) {
			T smallest = std::numeric_limits<T>::max();
			const T * const last = M.m_Val + NROWS*NCOLS;
			for (const T *it = M.m_Val; it != last; ++it)
				if (*it < smallest)
					smallest = *it;
			return smallest;
		}

		/** Returns the largest element of the matrix.
		  *  The search is seeded with the first element: std::numeric_limits<T>::min() is NOT a valid
		  *  seed, since for floating point types it is the smallest POSITIVE value, which gave wrong
		  *  results for matrices without positive elements.
		  */
		template <typename T,size_t NROWS,size_t NCOLS>
		T maximumMatrix(const CMatrixFixedNumeric<T,NROWS,NCOLS>& M) {
			// The matrix always has at least one element (m_Val cannot be a zero-length array).
			T ma = M.m_Val[0];
			for (size_t i=1;i<NROWS*NCOLS;i++)
				ma = std::max(ma,M.m_Val[i]);
			return ma;
		}

		/** Finds both the smallest and the largest element of the matrix in one single pass.
		  * \param M The matrix to scan.
		  * \param val_min [out] The smallest element.
		  * \param val_max [out] The largest element.
		  *
		  *  Both searches are seeded with the first element: std::numeric_limits<T>::min() is NOT a
		  *  valid seed for the maximum, since for floating point types it is the smallest POSITIVE
		  *  value, which gave wrong maximums for matrices without positive elements.
		  */
		template <typename T,size_t NROWS,size_t NCOLS>
		void minimumAndMaximumMatrix(const CMatrixFixedNumeric<T,NROWS,NCOLS>& M, T &val_min, T &val_max) {
			// The matrix always has at least one element (m_Val cannot be a zero-length array).
			T mi = M.m_Val[0];
			T ma = M.m_Val[0];
			for (size_t i=1;i<NROWS*NCOLS;i++)
			{
				mi = std::min(mi,M.m_Val[i]);
				ma = std::max(ma,M.m_Val[i]);
			}
			val_min = mi;
			val_max = ma;
		}

		/** Element-wise in-place subtraction of two matrices of the same size: M -= A */
		template <typename T,size_t NROWS,size_t NCOLS>
		void substractInPlace(
			CMatrixFixedNumeric<T,NROWS,NCOLS>& M,
			const CMatrixFixedNumeric<T,NROWS,NCOLS>& A)
		{
			const size_t count = NROWS*NCOLS;
			for (size_t idx=0; idx<count; ++idx)
				M.m_Val[idx] = M.m_Val[idx] - A.m_Val[idx];
		}

		/** Return the sum of two matrices: RET = A+B (neither operand is modified) */
		template <typename T,size_t NROWS,size_t NCOLS>
		CMatrixFixedNumeric<T,NROWS,NCOLS> sum(
			const CMatrixFixedNumeric<T,NROWS,NCOLS>& A,
			const CMatrixFixedNumeric<T,NROWS,NCOLS>& B)
		{
			// Accumulate on the local copy: "A" is const and cannot be the target of "+=".
			CMatrixFixedNumeric<T,NROWS,NCOLS> ret = A;
			ret+=B;
			return ret;
		}
		/** Return the difference of two matrices: RET = A-B (neither operand is modified) */
		template <typename T,size_t NROWS,size_t NCOLS>
		CMatrixFixedNumeric<T,NROWS,NCOLS> substract(
			const CMatrixFixedNumeric<T,NROWS,NCOLS>& A,
			const CMatrixFixedNumeric<T,NROWS,NCOLS>& B)
		{
			// Operate on the local copy: "A" is const and cannot be the target of "-=".
			CMatrixFixedNumeric<T,NROWS,NCOLS> ret = A;
			ret-=B;
			return ret;
		}


		/** Use the member method with the same name in matrix classes. */
		template <typename T, size_t N, size_t M>
		void multiply_HCHt(
			const CMatrixFixedNumeric<T,N,M> &H,
			const CMatrixFixedNumeric<T,M,M> &C,
			CMatrixFixedNumeric<T,N,N>   &R,
			bool                                 accumResultInOutput )
		{
			MRPT_TRY_START

			ASSERTMSG_( (void*)&C != (void*)&H, "C and H must be different matrices." )
			ASSERTMSG_( (void*)&R != (void*)&H, "R and H must be different matrices." )
			ASSERTMSG_( (void*)&C != (void*)&R,  "C and R must be different matrices.")

			CMatrixFixedNumeric<T,N,M>	R_;

			// First compute R_ = this * C:
			for (size_t i=0;i<N;i++)
				for (size_t j=0;j<M;j++)
				{
					T sumAccum = 0;
					for (size_t l=0;l<M;l++)
						sumAccum += H.get_unsafe(i,l) * C.get_unsafe(l,j);
					R_.get_unsafe(i,j)  = sumAccum;
				}

			// Now compute R = R_ * (~this):
			for (size_t i=0;i<N;i++)
				for (size_t j=i;j<N;j++)
				{
					T sumAccum = accumResultInOutput ? R.get_unsafe(i,j) : 0;
					for (size_t l=0;l<M;l++)
						sumAccum += R_.get_unsafe(i,l) * H.get_unsafe(j,l);
					R.get_unsafe(i,j) = R.get_unsafe(j,i) = sumAccum;
				}
			MRPT_TRY_END
		}

		/** Use the member method with the same name in matrix classes. */
		template <typename T, size_t N, size_t M>
		void multiply_HtCH(
			const CMatrixFixedNumeric<T,M,N> &H,
			const CMatrixFixedNumeric<T,M,M> &C,
			CMatrixFixedNumeric<T,N,N>   &R,
			bool                                 accumResultInOutput )
		{
			MRPT_TRY_START

			ASSERTMSG_( (void*)&C != (void*)&H, "C and H must be different matrices." )
			ASSERTMSG_( (void*)&R != (void*)&H, "R and H must be different matrices." )
			ASSERTMSG_( (void*)&C != (void*)&R,  "C and R must be different matrices.")

			CMatrixFixedNumeric<T,N,M>	R_;

			// First compute R_ = this * C:
			for (size_t i=0;i<N;i++)
				for (size_t j=0;j<M;j++)
				{
					T sumAccum = 0;
					for (size_t l=0;l<M;l++)
						sumAccum += H.get_unsafe(l,i) * C.get_unsafe(l,j);
					R_.get_unsafe(i,j)  = sumAccum;
				}

			// Now compute R = R_ * (~this):
			for (size_t i=0;i<N;i++)
				for (size_t j=i;j<N;j++)
				{
					T sumAccum = accumResultInOutput ? R.get_unsafe(i,j) : 0;
					for (size_t l=0;l<M;l++)
						sumAccum += R_.get_unsafe(i,l) * H.get_unsafe(l,j);
					R.get_unsafe(i,j) = R.get_unsafe(j,i) = sumAccum;
				}
			MRPT_TRY_END
		}

		/** @name Typedefs for common sizes
			@{ */
		// Naming: CMatrix<ElementType><rows><columns>, matching the (T,NROWS,NCOLS) template argument order.
		// Double-precision square matrices:
		typedef CMatrixFixedNumeric<double,2,2> CMatrixDouble22;
		typedef CMatrixFixedNumeric<double,3,3> CMatrixDouble33;
		typedef CMatrixFixedNumeric<double,4,4> CMatrixDouble44;
		typedef CMatrixFixedNumeric<double,6,6> CMatrixDouble66;
		// Double-precision row (1xN) and column (Nx1) matrices:
		typedef CMatrixFixedNumeric<double,1,3> CMatrixDouble13;
		typedef CMatrixFixedNumeric<double,3,1> CMatrixDouble31;
		typedef CMatrixFixedNumeric<double,1,2> CMatrixDouble12;
		typedef CMatrixFixedNumeric<double,2,1> CMatrixDouble21;
		typedef CMatrixFixedNumeric<double,6,1> CMatrixDouble61;
		typedef CMatrixFixedNumeric<double,1,6> CMatrixDouble16;

		// Single-precision square matrices:
		typedef CMatrixFixedNumeric<float,2,2> CMatrixFloat22;
		typedef CMatrixFixedNumeric<float,3,3> CMatrixFloat33;
		typedef CMatrixFixedNumeric<float,4,4> CMatrixFloat44;
		typedef CMatrixFixedNumeric<float,6,6> CMatrixFloat66;
		// Single-precision row (1xN) and column (Nx1) matrices:
		typedef CMatrixFixedNumeric<float,1,3> CMatrixFloat13;
		typedef CMatrixFixedNumeric<float,3,1> CMatrixFloat31;
		typedef CMatrixFixedNumeric<float,1,2> CMatrixFloat12;
		typedef CMatrixFixedNumeric<float,2,1> CMatrixFloat21;
		typedef CMatrixFixedNumeric<float,6,1> CMatrixFloat61;
		typedef CMatrixFixedNumeric<float,1,6> CMatrixFloat16;
		/**  @} */

		/** Returns a copy of the matrix m1 with every element scaled by v (uses the matrix "*=" operator).
		  */
		template <typename T,size_t NROWS,size_t NCOLS>
		CMatrixFixedNumeric<T,NROWS,NCOLS> operator *(const CMatrixFixedNumeric<T,NROWS,NCOLS>& m1,const T v) {
			CMatrixFixedNumeric<T,NROWS,NCOLS> scaled(m1);
			scaled *= v;
			return scaled;
		}

		/** Matrix product with the * operator: (NROWS x M1C) * (M1C x NCOLS) -> (NROWS x NCOLS).
		  *  The actual work is delegated to multiply().
		  */
		template <typename T,size_t NROWS,size_t NCOLS,size_t M1C>
		CMatrixFixedNumeric<T,NROWS,NCOLS> operator *(
			const CMatrixFixedNumeric<T,NROWS,M1C>& m1,
			const CMatrixFixedNumeric<T,M1C,NCOLS>& m2)
		{
			// Every entry of "product" is expected to be written by multiply().
			CMatrixFixedNumeric<T,NROWS,NCOLS> product(false,false);
			multiply(m1,m2,product);
			return product;
		}

		/** Element-wise sum of two equally sized matrices with the + operator (uses the matrix "+=" operator) */
		template <typename T,size_t NROWS,size_t NCOLS>
		CMatrixFixedNumeric<T,NROWS,NCOLS> operator +(
			const CMatrixFixedNumeric<T,NROWS,NCOLS>& m1,
			const CMatrixFixedNumeric<T,NROWS,NCOLS>& m2)
		{
			CMatrixFixedNumeric<T,NROWS,NCOLS> total(m1);
			total += m2;
			return total;
		}

		/** Element-wise difference of two equally sized matrices with the - operator (uses the matrix "-=" operator) */
		template <typename T,size_t NROWS,size_t NCOLS>
		CMatrixFixedNumeric<T,NROWS,NCOLS> operator -(
			const CMatrixFixedNumeric<T,NROWS,NCOLS>& m1,
			const CMatrixFixedNumeric<T,NROWS,NCOLS>& m2)
		{
			CMatrixFixedNumeric<T,NROWS,NCOLS> difference(m1);
			difference -= m2;
			return difference;
		}

		/** Unary negative operator -: returns a matrix of the SAME size as m with every element negated.
		  *  The previous declaration returned a matrix with rows and columns swapped (NCOLS x NROWS)
		  *  although the elements are copied in linear order, i.e. the result was mis-shaped for
		  *  non-square matrices. For square matrices nothing changes.
		 */
		template <typename T,size_t NROWS,size_t NCOLS>
		CMatrixFixedNumeric<T,NROWS,NCOLS> operator -(const CMatrixFixedNumeric<T,NROWS,NCOLS>& m)
		{
			// Every element of "res" is overwritten by the loop below.
			CMatrixFixedNumeric<T,NROWS,NCOLS>	res(false,false);
			for (size_t i=0; i<NROWS*NCOLS; i++)
				res.m_Val[i] = -m.m_Val[i];
			return res;
		}

		/** Unary transpose operator ~: returns the NCOLS x NROWS matrix t with t(j,i) = m(i,j).
		 */
		template <typename T,size_t NROWS,size_t NCOLS>
		CMatrixFixedNumeric<T,NCOLS,NROWS> operator ~(const CMatrixFixedNumeric<T,NROWS,NCOLS>& m)
		{
			// Every element of "transposed" is overwritten below.
			CMatrixFixedNumeric<T,NCOLS,NROWS>	transposed(false,false);
			for (size_t srcRow=0; srcRow<NROWS; srcRow++)
			{
				for (size_t srcCol=0; srcCol<NCOLS; srcCol++)
				{
					transposed.get_unsafe(srcCol,srcRow) = m.get_unsafe(srcRow,srcCol);
				}
			}
			return transposed;
		}

		/** Unary inverse operator !: returns the inverse of a square matrix, as computed by its inv() method */
		template <typename T,size_t NROWS>
		CMatrixFixedNumeric<T,NROWS,NROWS> operator !(const CMatrixFixedNumeric<T,NROWS,NROWS>& m)
		{
			// The result is filled in by inv().
			CMatrixFixedNumeric<T,NROWS,NROWS>	inverse(false,false);
			m.inv(inverse);
			return inverse;
		}

		/** Returns the determinant of the matrix  */
		template <typename T,size_t NROWS,size_t NCOLS>
		T detMatrix(const CMatrixFixedNumeric<T,NROWS,NCOLS>& M)
		{
			// general case:
			ASSERTMSG_(NROWS==NCOLS,"Determinant of non-square matrix")

			CMatrixFixedNumeric<T,NROWS,NCOLS> temp(M);
			T	piv,detVal = T(1);

			for (size_t k=0; k < NROWS; k++)
			{
				int		indx = temp.pivot(k);
				if (indx == -1)
					return 0;
				if (indx != 0)
					detVal = - detVal;
				detVal = detVal * temp.get_unsafe(k,k);

				for (size_t i=k+1; i < NROWS; i++)
				{
					piv = temp.get_unsafe(i,k) / temp.get_unsafe(k,k);
					for (size_t j=k+1; j < NROWS; j++)
						temp.get_unsafe(i,j) -= piv * temp.get_unsafe(k,j);
				}
			}
			return detVal;
		} // end of "det"

		/** Replaces, in place, every element of the matrix by its square root (element-wise, not a matrix square root) */
		template <typename T,size_t NROWS,size_t NCOLS>
		void sqrtMatrix(CMatrixFixedNumeric<T,NROWS,NCOLS>& M)
		{
			T *cur = M.m_Val;
			T *const last = M.m_Val + NROWS*NCOLS;
			for (; cur!=last; ++cur)
				*cur = sqrt(*cur);
		}

		// Helper routines used by eigenVectorsMatrix() below (declared here, defined elsewhere).
		// They are called there with 1-based buffers: a/z[1..nn][1..nn], d[1..nn], e[1..nn].
		// NOTE(review): presumably the "Numerical Recipes in C" routines of the same names - confirm.
		template <typename T> void MRPTDLLIMPEXP tred2(T **a, size_t nn, T d[], T e[]);
		template <class T> void  MRPTDLLIMPEXP tqli(T d[], T e[], size_t nn, T **z);

		/** Used from the method from CMatrix classes instead \sa CMatrixFixedNumeric::eigenVectors
		  *  \param M Input N x N matrix; symmetry is checked (exception if not symmetric) only in _DEBUG builds.
		  *  \param Z Output: column j holds the eigenvector of the j'th eigenvalue.
		  *  \param D Output: diagonal matrix with the ABSOLUTE VALUE of the eigenvalues. The order is
		  *           ascending by the signed eigenvalue (the sign is dropped after sorting).
		  *
		  *  All the temporary buffers are held in std::vector's, so they are released even if an
		  *   exception is raised by the helper routines or the assertions (raw new[]/delete[] leaked then).
		  */
		template <typename T,size_t N>
		void eigenVectorsMatrix(
			const CMatrixFixedNumeric<T,N,N> &M,
			CMatrixFixedNumeric<T,N,N> &Z,
			CMatrixFixedNumeric<T,N,N> &D )
		{
			// TODO: (JL) Rewrite this whole thing, please!
			std::vector<unsigned int>	indxs;
			std::vector<bool>	already;

			// 1-based work buffers, as required by tred2/tqli:
			std::vector<T>		a_data;	// Contiguous storage for the (N+1)x(N+1) work matrix
			std::vector<T*>		a;		// Row pointers into a_data (a[0] is unused and left NULL)
			std::vector<T>		d,e;

			size_t	i,j;

			MRPT_TRY_START

			// Algorithm from "Numerical recipes in C"

			// Check for symmetry
			// --------------------------------------
#ifdef _DEBUG
			for (i=0;i<N;i++)
				for (j=i;j<N;j++)
					if (M.get_unsafe(i,j)!=M.get_unsafe(j,i))
					{
						THROW_EXCEPTION(format("eigenVectors: The matrix is not symmetric! m(%lu,%lu)=%.16e != m(%lu,%lu)=%.16e\n",
							static_cast<unsigned long>(i),static_cast<unsigned long>(j), static_cast<double> ( M.get_unsafe(i,j) ),
							static_cast<unsigned long>(j),static_cast<unsigned long>(i), static_cast<double> ( M.get_unsafe(j,i) )) )
					}
#endif

			// Copy the matrix content to "a":
			// --------------------------------------
			a_data.assign( (N+1)*(N+1), T(0) );
			a.assign( N+1, static_cast<T*>(0) );
			for (i=1;i<=N;i++)	a[i] = &a_data[ i*(N+1) ];
			d.assign( N+1, T(0) );
			e.assign( N+1, T(0) );

			for (i=1;i<=N;i++)
				for (j=1;j<=N;j++)
					a[i][j] = M.get_unsafe(i-1,j-1);

			// Algorithm
			// --------------------------------------
			tred2( &a[0], N, &d[0], &e[0]);
			tqli( &d[0], &e[0], N, &a[0]);

			// In "d" are the eigenvalues
			// In "a" are the eigenvectors as columns:

			// SORT: Build a list of the N index in
			//   ascending order of eigenvalue:
			// --------------------------------------
			indxs.resize(N+1);
			already.resize(N+1, false);

			for (i=1;i<=N;i++)
			{
				// Selection sort step: pick the smallest eigenvalue not taken yet.
				size_t		minIndx = std::numeric_limits<size_t>::max();
				for (j=1;j<=N;j++)
					if (!already[j])
					{
						if (minIndx==std::numeric_limits<size_t>::max())		minIndx = j;
						else
							if (d[j]<d[minIndx])	minIndx = j;
					}

				// The i'th sorted element:
				indxs[i] = static_cast<unsigned int> ( minIndx );
				already[minIndx] = true;
			}

			for (i=1;i<=N;i++)
				ASSERT_(already[i]);

			// Copy results to matrices classes
			// --------------------------------------
			for (i=1;i<=N;i++)
				for (j=1;j<=N;j++)
				{
					Z(i-1,j-1) = a[i][indxs[j]];
					if (i==j)
					{
						// The diagonal gets |eigenvalue|:
						if (d[indxs[j]]<0)
								D(i-1,i-1) = -d[indxs[j]];
						else	D(i-1,i-1) = d[indxs[j]];
					}
					else		D(i-1,j-1) = 0;
				}

			// No explicit "free" needed: the vectors release the buffers on any exit path.

			MRPT_TRY_END_WITH_CLEAN_UP( std::cout << "[eigenVectors] The matrix leading to exception is:" << std::endl << M << std::endl; )
		}

		/** Used from the method from CMatrix classes instead \sa CMatrixFixedNumeric::eigenVectors - 2x2 version.
		  *  Only declared here: the definition is not in this part of the file.
		  */
		template <typename T> void MRPTDLLIMPEXP eigenVectorsMatrix(const CMatrixFixedNumeric<T,2,2> &M,CMatrixFixedNumeric<T,2,2> &Z,CMatrixFixedNumeric<T,2,2> &D );

		/** Returns the determinant of a 2x2 matrix:  DET = a11*a22 - a12*a21 */
		template <typename T> T detMatrix(const CMatrixFixedNumeric<T,2,2> &M) {
			// Row-major element buffer: v[0]=a11, v[1]=a12, v[2]=a21, v[3]=a22
			const T *v = M.m_Val;
			return v[0]*v[3]-v[1]*v[2];
		}

		/** Returns the determinant of the matrix
 		  *  DET  =  a11(a33a22-a32a23)-a21(a33a12-a32a13)+a31(a23a12-a22a13)
		  */
		template <typename T> T detMatrix(const CMatrixFixedNumeric<T,3,3> &M) {
			return M._E(1,1)*(M._E(3,3)*M._E(2,2)-M._E(3,2)*M._E(2,3))-
				M._E(2,1)*(M._E(3,3)*M._E(1,2)-M._E(3,2)*M._E(1,3))+
				M._E(3,1)*(M._E(2,3)*M._E(1,2)-M._E(2,2)*M._E(1,3));
		}

		/** Returns the determinant of a 4x4 matrix, expanding along the first row:
		  *   DET = a11*D1 - a12*D2 + a13*D3 - a14*D4
		  *  where Dk is the determinant of the 3x3 minor made of rows 2..4 and all the columns but the k'th.
		  *  The minors are kept in type T: they used to be stored in "float", which silently
		  *  truncated the precision for T=double.
		  */
		template <typename T> T detMatrix(const CMatrixFixedNumeric<T,4,4> &M) {
			// Minor of (1,1): columns 2,3,4
			const T D1 =
				M._E(1+1,1+1)*(M._E(3+1,3+1)*M._E(2+1,2+1)-M._E(3+1,2+1)*M._E(2+1,3+1))-
				M._E(2+1,1+1)*(M._E(3+1,3+1)*M._E(1+1,2+1)-M._E(3+1,2+1)*M._E(1+1,3+1))+
				M._E(3+1,1+1)*(M._E(2+1,3+1)*M._E(1+1,2+1)-M._E(2+1,2+1)*M._E(1+1,3+1));
			// Minor of (1,2): columns 1,3,4
			const T D2 =
				M._E(1+1,1)*(M._E(3+1,3+1)*M._E(2+1,2+1)-M._E(3+1,2+1)*M._E(2+1,3+1))-
				M._E(2+1,1)*(M._E(3+1,3+1)*M._E(1+1,2+1)-M._E(3+1,2+1)*M._E(1+1,3+1))+
				M._E(3+1,1)*(M._E(2+1,3+1)*M._E(1+1,2+1)-M._E(2+1,2+1)*M._E(1+1,3+1));
			// Minor of (1,3): columns 1,2,4
			const T D3 =
				M._E(1+1,1)*(M._E(3+1,3+1)*M._E(2+1,2)-M._E(3+1,2)*M._E(2+1,3+1))-
				M._E(2+1,1)*(M._E(3+1,3+1)*M._E(1+1,2)-M._E(3+1,2)*M._E(1+1,3+1))+
				M._E(3+1,1)*(M._E(2+1,3+1)*M._E(1+1,2)-M._E(2+1,2)*M._E(1+1,3+1));
			// Minor of (1,4): columns 1,2,3
			const T D4 =
				M._E(1+1,1)*(M._E(3+1,3)*M._E(2+1,2)-M._E(3+1,2)*M._E(2+1,3))-
				M._E(2+1,1)*(M._E(3+1,3)*M._E(1+1,2)-M._E(3+1,2)*M._E(1+1,3))+
				M._E(3+1,1)*(M._E(2+1,3)*M._E(1+1,2)-M._E(2+1,2)*M._E(1+1,3));
			return M._E(1,1)*D1 - M._E(1,2)*D2 + M._E(1,3)*D3 - M._E(1,4)*D4;
		}

		/** Returns the inverse of the matrix in "out_inv"
		  *  \sa CMatrixFixedNumeric::inv
		  */
		template <typename T,size_t NROWS,size_t NCOLS>
		void  invMatrix( const CMatrixFixedNumeric<T,NROWS,NCOLS> &M, CMatrixFixedNumeric<T,NROWS,NCOLS> &out_inv ) {
			CMatrixFixedNumeric<T,NROWS,NCOLS> temp = M;
			invMatrix_destroySrc(temp,out_inv);  // temp is destroyed in inv_fast
		}

		/** Returns the inverse of the matrix in "out_inv" , DESTROYING the original matrix M
		  *  Generic implementation by elimination: out_inv starts as the unit matrix and receives the
		  *   same row scalings and row subtractions that are applied to M.
		  *  Asserts that the matrix is square; raises an exception if pivot() returns -1 for any column.
		  *  \sa CMatrixFixedNumeric::inv_fast
		  */
		template <typename T,size_t NROWS,size_t NCOLS>
		void invMatrix_destroySrc( CMatrixFixedNumeric<T,NROWS,NCOLS> &M, CMatrixFixedNumeric<T,NROWS,NCOLS> &out_inv ) {
			// Generic implementation (specializations exist for 2x2, 3x3, ...)
			ASSERTMSG_(NROWS==NCOLS,"Inversion of non-square matrix")

			T a1,a2;
			out_inv.unit();
			for (size_t k=0; k < NROWS; k++)
			{
				// pivot() result: -1 => treated as singular; 0 => nothing else to do; otherwise used as a row index below.
				int indx = M.pivot(k);
				if (indx == -1)
				{
					std::cerr << "[inv] Matrix that leaded to error is:" << std::endl << M << std::endl;
					THROW_EXCEPTION( "Inversion of a singular matrix");
				}

				if (indx != 0)
				{
					// Swap rows:
					// NOTE(review): the row exchange is applied to M only; out_inv never gets the matching
					//  exchange of rows k and indx. Whether this is right depends on what pivot() already
					//  does to M (not visible here) - TODO confirm.
					M.swap_rows(k,indx);
				}
				// Scale row k of both matrices so that M(k,k) becomes 1:
				a1 = M.get_unsafe(k,k);
				const T a1_i = 1/a1;
				for (size_t j=0; j < NROWS; j++)
				{
					M.get_unsafe(k,j) *= a1_i;
					out_inv.get_unsafe(k,j) *= a1_i;
				}
				// Subtract a multiple of row k from every other row, in both matrices:
				for (size_t i=0; i < NROWS; i++)
				{
					if (i != k)
					{
						a2 = M.get_unsafe(i,k);
						for (size_t j=0; j < NROWS; j++)
						{
							M.get_unsafe(i,j)  -= a2 * M.get_unsafe(k,j);
							out_inv.get_unsafe(i,j) -= a2 * out_inv.get_unsafe(k,j);
						}
					}
				}
			}
		}

		/** Returns the inverse of a 2x2 matrix in "out_inv" (asserts that the determinant is not zero).
		  *  All the elements of M are read before anything is written to out_inv, so the result is
		  *  also correct when M and out_inv are the same object.
		  */
		template <typename T> void invMatrix( const CMatrixFixedNumeric<T,2,2> &M, CMatrixFixedNumeric<T,2,2> &out_inv )
		{
			// | a11 a12 |-1             |  a22 -a12 |
			// | a21 a22 |    =  1/DET * | -a21  a11 |
			//
			const T det = M.det();
			ASSERTMSG_(det!=0,"Singular matrix")
			const T det_inv = 1.0f / det;
			// Snapshot of the inputs (row-major buffer), taken before touching the output:
			const T a11 = M.m_Val[2*0+0];
			const T a12 = M.m_Val[2*0+1];
			const T a21 = M.m_Val[2*1+0];
			const T a22 = M.m_Val[2*1+1];
			out_inv.m_Val[2*0+0] =  a22;
			out_inv.m_Val[2*0+1] = -a12;
			out_inv.m_Val[2*1+0] = -a21;
			out_inv.m_Val[2*1+1] =  a11;
			out_inv*= det_inv;
		}
		/** 2x2 version of invMatrix_destroySrc: simply forwards to the closed-form 2x2 invMatrix. */
		template <typename T>
		void invMatrix_destroySrc(CMatrixFixedNumeric<T,2,2> &M, CMatrixFixedNumeric<T,2,2>& out_inv)
		{
			invMatrix(M,out_inv);
		}

		/** Returns the inverse of the matrix in "out_inv" */
		template <typename T> void invMatrix(const CMatrixFixedNumeric<T,3,3> &M, CMatrixFixedNumeric<T,3,3>& out_inv)
		{
			// | a11 a12 a13 |-1             |   a33a22-a32a23  -(a33a12-a32a13)   a23a12-a22a13  |
			// | a21 a22 a23 |    =  1/DET * | -(a33a21-a31a23)   a33a11-a31a13  -(a23a11-a21a13) |
			// | a31 a32 a33 |               |   a32a21-a31a22  -(a32a11-a31a12)   a22a11-a21a12  |
			const T det = M.det();
			ASSERTMSG_(det!=0,"Singular matrix")
			const T det_inv = 1.0f / det;
			out_inv._E(1,1)	=  (M._E(3,3)*M._E(2,2)-M._E(3,2)*M._E(2,3) );
			out_inv._E(1,2) =  (-M._E(3,3)*M._E(1,2)+M._E(3,2)*M._E(1,3) );
			out_inv._E(1,3) =  (M._E(2,3)*M._E(1,2)-M._E(2,2)*M._E(1,3) );
			out_inv._E(2,1) =  (-M._E(3,3)*M._E(2,1)+M._E(3,1)*M._E(2,3));
			out_inv._E(2,2) =  (M._E(3,3)*M._E(1,1)-M._E(3,1)*M._E(1,3));
			out_inv._E(2,3) =  (-M._E(2,3)*M._E(1,1)+M._E(2,1)*M._E(1,3));
			out_inv._E(3,1) =  (M._E(3,2)*M._E(2,1)-M._E(3,1)*M._E(2,2));
			out_inv._E(3,2) =  (-M._E(3,2)*M._E(1,1)+M._E(3,1)*M._E(1,2));
			out_inv._E(3,3) =  (M._E(2,2)*M._E(1,1)-M._E(2,1)*M._E(1,2));
			out_inv*= det_inv;
		}
		/** 3x3 version of invMatrix_destroySrc: simply forwards to the closed-form 3x3 invMatrix. */
		template <typename T>
		void invMatrix_destroySrc(CMatrixFixedNumeric<T,3,3> &M, CMatrixFixedNumeric<T,3,3>& out_inv)
		{
			invMatrix(M,out_inv);
		}


		/** Auxiliary function used in the constructor of dyn. matrices from a fixed one:
		  *  resizes DST to NROWS x NCOLS and copies SRC into it, one row at a time.
		  */
		template <typename T,size_t NROWS,size_t NCOLS>
		void fixedToDynMatrix( const CMatrixFixedNumeric<T,NROWS,NCOLS> &SRC, CMatrixTemplateNumeric<T> &DST)
		{
			DST.resize(NROWS,NCOLS);
			const T *srcRow = SRC.m_Val;  // Start of the current row in the row-major source buffer
			for (size_t row=0; row<NROWS; row++, srcRow+=NCOLS)
			{
				::memcpy(DST.get_unsafe_row(row), srcRow, sizeof(T)*NCOLS);
			}
		}

		/** Auxiliary function used in CMatrixTemplate: writes the TRANSPOSE of "in" into M, with the
		  *  top-left corner at (nRow,nCol). The inserted block spans NCOLS rows and NROWS columns of M.
		  */
		template <typename T,size_t NROWS,size_t NCOLS>
		void insertMatrixFixTransposeIntoDyn(
			CMatrixTemplate<T> &M,
			const size_t nRow,
			const size_t nCol,
			const CMatrixFixedNumeric<T,NROWS,NCOLS> &in)
		{
			ASSERTMSG_( (nRow+NCOLS <= M.getRowCount() ) && (nCol+NROWS<= M.getColCount()), "insertMatrix: Row or Col index out of bounds")
			for (size_t srcRow=0; srcRow<NROWS; srcRow++)
			{
				for (size_t srcCol=0; srcCol<NCOLS; srcCol++)
				{
					// in(srcRow,srcCol) lands on the transposed position inside M:
					M.get_unsafe(nRow+srcCol,nCol+srcRow) = in.get_unsafe(srcRow,srcCol);
				}
			}
		}

		/** Auxiliary function used in CMatrixTemplate: copies "in" into M, with the top-left corner
		  *  at (nRow,nCol). Rows are copied as raw memory blocks.
		  */
		template <typename T,size_t NROWS,size_t NCOLS>
		void insertMatrixFixIntoDyn(
			CMatrixTemplate<T> &M,
			const size_t nRow,
			const size_t nCol,
			const CMatrixFixedNumeric<T,NROWS,NCOLS> &in)
		{
			ASSERTMSG_( (nRow+NROWS <= M.getRowCount() ) && (nCol+NCOLS <= M.getColCount()), "insertMatrix: Row or Col index out of bounds")
			const T *srcRow = in.m_Val;  // Start of the current row in the row-major source buffer
			for (size_t row=0; row<NROWS; row++, srcRow+=NCOLS)
			{
				::memcpy( M.get_unsafe_row(row+nRow)+nCol, srcRow, NCOLS*sizeof(T));
			}
		}

		/** Used from CMatrixTemplate::extractMatrix */
		template <typename T,size_t NROWS,size_t NCOLS>
		void extractFixMatrixFromDynMatrix(
			const CMatrixTemplate<T> &M,
			const size_t nRow,
			const size_t nCol,
			CMatrixFixedNumeric<T,NROWS,NCOLS> &outMat)
		{
			ASSERTMSG_( (nRow+NROWS <= M.getRowCount() ) && (nCol+NCOLS <= M.getColCount()), "extractMatrix: Row or Col index out of bounds")
			for (size_t r=0;r<NROWS;r++)
				::memcpy( outMat.m_Val+r*NCOLS, M.get_unsafe_row(r+nRow)+nCol, sizeof(T)*NCOLS );
		}

		/** @name Conversions from point & poses to matrices
			 Used from the "operator =" from poses/points to CMatrixFixedNumeric
		  @{ */

		// Generic fallbacks, selected for any matrix type/size without a specialization below:
		//  they do nothing but report "Matrix of the wrong size" through THROW_EXCEPTION.

		/** Fallback for CPoint2D sources */
		template <typename T,size_t NROWS,size_t NCOLS>
		CMatrixFixedNumeric<T,NROWS,NCOLS> & matrixFromPoseOrPoint(CMatrixFixedNumeric<T,NROWS,NCOLS>&M, const CPoint2D &p)
		{
			THROW_EXCEPTION("Matrix of the wrong size")
		}

		/** Fallback for CPoint3D sources */
		template <typename T,size_t NROWS,size_t NCOLS>
		CMatrixFixedNumeric<T,NROWS,NCOLS> & matrixFromPoseOrPoint(CMatrixFixedNumeric<T,NROWS,NCOLS>&M, const CPoint3D &p)
		{
			THROW_EXCEPTION("Matrix of the wrong size")
		}

		/** Fallback for CPose2D sources */
		template <typename T,size_t NROWS,size_t NCOLS>
		CMatrixFixedNumeric<T,NROWS,NCOLS> & matrixFromPoseOrPoint(CMatrixFixedNumeric<T,NROWS,NCOLS>&M, const CPose2D &p)
		{
			THROW_EXCEPTION("Matrix of the wrong size")
		}

		/** Fallback for CPose3D sources */
		template <typename T,size_t NROWS,size_t NCOLS>
		CMatrixFixedNumeric<T,NROWS,NCOLS> & matrixFromPoseOrPoint(CMatrixFixedNumeric<T,NROWS,NCOLS>&M, const CPose3D &p)
		{
			THROW_EXCEPTION("Matrix of the wrong size")
		}

		// Specializations for the supported double-precision sizes (declared here, defined elsewhere).
		// Column-vector targets (2x1, 3x1, 3x1, 6x1):
		template <> CMatrixDouble21 & matrixFromPoseOrPoint(CMatrixDouble21 &M, const CPoint2D &p);
		template <> CMatrixDouble31 & matrixFromPoseOrPoint(CMatrixDouble31 &M, const CPoint3D &p);
		template <> CMatrixDouble31 & matrixFromPoseOrPoint(CMatrixDouble31 &M, const CPose2D &p);
		template <> CMatrixDouble61 & matrixFromPoseOrPoint(CMatrixDouble61 &M, const CPose3D &p);
		// Row-vector targets (1x2, 1x3, 1x3, 1x6):
		template <> CMatrixDouble12 & matrixFromPoseOrPoint(CMatrixDouble12 &M, const CPoint2D &p);
		template <> CMatrixDouble13 & matrixFromPoseOrPoint(CMatrixDouble13 &M, const CPoint3D &p);
		template <> CMatrixDouble13 & matrixFromPoseOrPoint(CMatrixDouble13 &M, const CPose2D &p);
		template <> CMatrixDouble16 & matrixFromPoseOrPoint(CMatrixDouble16 &M, const CPose3D &p);

		  /** @} */

		/** Read operator from a CStream (float matrices). The format is compatible with that of CMatrix & CMatrixD:
		  *  a CMatrix is read from the stream and then assigned to M.
		  */
		template <size_t NROWS,size_t NCOLS>
		mrpt::utils::CStream &operator>>(mrpt::utils::CStream &in, CMatrixFixedNumeric<float,NROWS,NCOLS> & M)
		{
			CMatrix  fromStream;
			in >> fromStream;
			M = fromStream;
			return in;
		}
		/** Read operator from a CStream (double matrices). The format is compatible with that of CMatrix & CMatrixD:
		  *  a CMatrixD is read from the stream and then assigned to M.
		  */
		template <size_t NROWS,size_t NCOLS>
		mrpt::utils::CStream &operator>>(mrpt::utils::CStream &in, CMatrixFixedNumeric<double,NROWS,NCOLS> & M)
		{
			CMatrixD  fromStream;
			in >> fromStream;
			M = fromStream;
			return in;
		}

		/** Write operator for writing float matrices into a CStream. The format is compatible with that of CMatrix & CMatrixD:
		  *  M is converted (through CMatrixFloat) to a CMatrix, which is what actually gets written.
		  */
		template <size_t NROWS,size_t NCOLS>
		mrpt::utils::CStream &operator<<(mrpt::utils::CStream &out,const CMatrixFixedNumeric<float,NROWS,NCOLS> & M)
		{
			CMatrix toStream = CMatrixFloat(M);
			out << toStream;
			return out;
		}
		/** Write operator for writing double matrices into a CStream. The format is compatible with that of CMatrix & CMatrixD:
		  *  M is converted (through CMatrixDouble) to a CMatrixD, which is what actually gets written.
		  */
		template <size_t NROWS,size_t NCOLS>
		mrpt::utils::CStream &operator<<(mrpt::utils::CStream &out,const CMatrixFixedNumeric<double,NROWS,NCOLS> & M)
		{
			CMatrixD toStream = CMatrixDouble(M);
			out << toStream;
			return out;
		}

		/** Textual output stream function: one text line per matrix row, each element in a field
		  *   of width 10, after setting the stream precision to 6.
		  *    Use only for text output, for example:  "std::cout << mat;"
		  */
		template <class T,size_t NROWS, size_t NCOLS>
		std::ostream& operator << (std::ostream& ostrm, const CMatrixFixedNumeric<T,NROWS,NCOLS>& m)
		{
			ostrm << std::setprecision(6);

			for (size_t row=0; row<NROWS; row++)
			{
				for (size_t col=0; col<NCOLS; col++)
				{
					ostrm << std::setw(10);
					ostrm << m.get_unsafe(row,col);
				}
				ostrm << std::endl;
			}
			return ostrm;
		}



		// ----------------------------- Begin of SSE2 specializations ---------------------------

		/** @name Optimized functions (with specializations and/or SSE2 code)
		  @{ */
#if MRPT_HAS_SSE2 && defined(MRPT_USE_SSE2)
		/** Default template (not really implemented ) */
//		template <typename T,size_t NROWS,size_t NCOLS>
//		void multiply_SIMD(CMatrixFixedNumeric<T,NROWS,NCOLS>& m, const T val);
//
//		/** Default template (not really implemented ) */
//		template <typename T,size_t NROWS,size_t NCOLS,size_t M1C>
//		void multiply_SIMD( const CMatrixFixedNumeric<T,NROWS,M1C>& m1, const CMatrixFixedNumeric<T,M1C,NCOLS>& m2, CMatrixFixedNumeric<T,NROWS,NCOLS>& RESULT );
//
//		/** Default template (not really implemented ) */
//		template <typename T,size_t NROWS,size_t NCOLS>
//		void sumInPlace_SIMD( CMatrixFixedNumeric<T,NROWS,NCOLS>& m, const T val);

		/** Multiplies every element of a float matrix by "val" (generic size).
		  *  Elements are processed 4 at a time with the aligned SSE load/store intrinsics
		  *  (so m.m_Val must be 16-byte aligned); the leftover elements are done one by one.
		  */
		template <size_t NROWS,size_t NCOLS>
		void multiply_SIMD(
			CMatrixFixedNumeric<float,NROWS,NCOLS>& m,
			const float val)
		{
			const size_t total  = NROWS*NCOLS;
			const size_t packed = (total >> 2) << 2;   // Largest multiple of 4 not above "total"
			const __m128 factor = _mm_load1_ps(&val);  // "val" replicated in the 4 lanes
			float *p = m.m_Val;
			float *const packedEnd = m.m_Val + packed;
			float *const end       = m.m_Val + total;
			for (; p!=packedEnd; p+=4)
				_mm_store_ps(p, _mm_mul_ps(_mm_load_ps(p), factor ) );
			// The rest, multiply as normal:
			for (; p!=end; ++p)
				*p *= val;
		}

		/** Multiply two 4x4 matrices - SSE2 specialization: RESULT = m1 * m2.
		  *  RESULT must be a different object than m1 and m2 (asserted).
		  *  Declared "inline": a full specialization is an ordinary function, so defining it
		  *  non-inline in a header yields multiple-definition link errors.
		  *  NOTE(review): no three-argument primary template is visible in this section (the one
		  *   above is commented out) - confirm that it is declared elsewhere.
		  */
		template <>
		inline void multiply_SIMD(
			const CMatrixFixedNumeric<float,4,4>& m1,
			const CMatrixFixedNumeric<float,4,4>& m2,
			CMatrixFixedNumeric<float,4,4>& RESULT )
		{
			ASSERT_(&m1!=&RESULT && &m2!=&RESULT)
			// Columns:
			for (unsigned j=0;j<4;j++)
			{
				// Gather column j of m2 (row-major storage, stride 4) into one SSE register:
				__m128 m2col;
				((float*)&m2col)[0] = m2.m_Val[0+j];
				((float*)&m2col)[1] = m2.m_Val[4+j];
				((float*)&m2col)[2] = m2.m_Val[8+j];
				((float*)&m2col)[3] = m2.m_Val[12+j];

				for (unsigned i=0;i<4;i++)
				{
					// Lane-wise product of row i of m1 with column j of m2, then the sum of the 4 lanes:
					__m128 aux = _mm_mul_ps( m2col, _mm_load_ps(&m1.m_Val[i<<2]) );
					RESULT.m_Val[(i<<2)+j] = ((float*)&aux)[0]+((float*)&aux)[1]+((float*)&aux)[2]+((float*)&aux)[3];
				}
			}
		}

		/** Multiply by scalar - SSE2 specialization for 2x2 float matrices (the 4 elements fit in one register).
		  *  Declared "inline": a full specialization is an ordinary function, so defining it
		  *  non-inline in a header yields multiple-definition link errors.
		  */
		template <>
		inline void multiply_SIMD( CMatrixFixedNumeric<float,2,2>& m, const float val)
		{
			__m128 cnts = _mm_load1_ps(&val);  // 4 copies of "val"
			float *ptr = m.m_Val;
			_mm_store_ps(ptr, _mm_mul_ps(_mm_load_ps(ptr), cnts ) );
		}
		/** Multiply by scalar - SSE2 specialization for 3x3 float matrices: two packed blocks of 4 plus one scalar element.
		  *  Declared "inline": a full specialization is an ordinary function, so defining it
		  *  non-inline in a header yields multiple-definition link errors.
		  */
		template <>
		inline void multiply_SIMD( CMatrixFixedNumeric<float,3,3>& m, const float val)
		{
			__m128 cnts = _mm_load1_ps(&val);  // 4 copies of "val"
			float *ptr = m.m_Val;
			_mm_store_ps(ptr, _mm_mul_ps(_mm_load_ps(ptr), cnts ) ); ptr+=4;
			_mm_store_ps(ptr, _mm_mul_ps(_mm_load_ps(ptr), cnts ) ); ptr+=4;
			*ptr *= val;  // The 9th element
		}
		/** Multiply by scalar - SSE2 specialization for 4x4 float matrices: four packed blocks of 4.
		  *  Declared "inline": a full specialization is an ordinary function, so defining it
		  *  non-inline in a header yields multiple-definition link errors.
		  */
		template <>
		inline void multiply_SIMD( CMatrixFixedNumeric<float,4,4>& m, const float val)
		{
			__m128 cnts = _mm_load1_ps(&val);  // 4 copies of "val"
			float *ptr = m.m_Val;
			_mm_store_ps(ptr, _mm_mul_ps(_mm_load_ps(ptr), cnts ) ); ptr+=4;
			_mm_store_ps(ptr, _mm_mul_ps(_mm_load_ps(ptr), cnts ) ); ptr+=4;
			_mm_store_ps(ptr, _mm_mul_ps(_mm_load_ps(ptr), cnts ) ); ptr+=4;
			_mm_store_ps(ptr, _mm_mul_ps(_mm_load_ps(ptr), cnts ) );
		}

		/** Multiply by scalar - SSE2 specialization for 2x2 double matrices: two packed blocks of 2.
		  *  Declared "inline": a full specialization is an ordinary function, so defining it
		  *  non-inline in a header yields multiple-definition link errors.
		  *  NOTE(review): the generic double overload is defined further down in this section -
		  *   confirm that a matching primary template is declared before this point.
		  */
		template <>
		inline void multiply_SIMD( CMatrixFixedNumeric<double,2,2>& m, const double val)
		{
			__m128d cnts = _mm_load1_pd(&val);  // 2 copies of "val"
			double *ptr = m.m_Val;
			_mm_store_pd(ptr, _mm_mul_pd(_mm_load_pd(ptr), cnts ) ); ptr+=2;
			_mm_store_pd(ptr, _mm_mul_pd(_mm_load_pd(ptr), cnts ) );
		}
		/** Multiply by scalar - SSE2 specialization for 3x3 double matrices: four packed blocks of 2 plus one scalar element.
		  *  Declared "inline": a full specialization is an ordinary function, so defining it
		  *  non-inline in a header yields multiple-definition link errors.
		  *  NOTE(review): the generic double overload is defined further down in this section -
		  *   confirm that a matching primary template is declared before this point.
		  */
		template <>
		inline void multiply_SIMD( CMatrixFixedNumeric<double,3,3>& m, const double val)
		{
			__m128d cnts = _mm_load1_pd(&val);  // 2 copies of "val"
			double *ptr = m.m_Val;
			for (size_t i=0;i<8/2;i++)
			{
				_mm_store_pd(ptr, _mm_mul_pd(_mm_load_pd(ptr), cnts ) );
				ptr+=2;
			}
			*ptr *= val;  // The 9th element
		}
		/** Multiply by scalar - SSE2 specialization for 4x4 double matrices: eight packed blocks of 2.
		  *  Declared "inline": a full specialization is an ordinary function, so defining it
		  *  non-inline in a header yields multiple-definition link errors.
		  *  NOTE(review): the generic double overload is defined further down in this section -
		  *   confirm that a matching primary template is declared before this point.
		  */
		template <>
		inline void multiply_SIMD( CMatrixFixedNumeric<double,4,4>& m, const double val)
		{
			__m128d cnts = _mm_load1_pd(&val);  // 2 copies of "val"
			double *ptr = m.m_Val;
			for (size_t i=0;i<16/2;i++)
			{
				_mm_store_pd(ptr, _mm_mul_pd(_mm_load_pd(ptr), cnts ) );
				ptr+=2;
			}
		}

		/** Multiplies every element of a double matrix by "val" (generic size).
		  *  Elements are processed 2 at a time with the aligned SSE2 load/store intrinsics
		  *  (so m.m_Val must be 16-byte aligned); a leftover element is done as a plain scalar.
		  */
		template <size_t NROWS,size_t NCOLS>
		void multiply_SIMD(
			CMatrixFixedNumeric<double,NROWS,NCOLS>& m,
			const double val)
		{
			const size_t total  = NROWS*NCOLS;
			const size_t packed = (total >> 1) << 1;    // Largest multiple of 2 not above "total"
			const __m128d factor = _mm_load1_pd(&val);  // "val" replicated in the 2 lanes
			double *p = m.m_Val;
			double *const packedEnd = m.m_Val + packed;
			double *const end       = m.m_Val + total;
			for (; p!=packedEnd; p+=2)
				_mm_store_pd(p, _mm_mul_pd(_mm_load_pd(p), factor ) );
			// The rest, multiply as normal:
			for (; p!=end; ++p)
				*p *= val;
		}

		/** Adds the scalar "val" to every element of a float matrix.
		  *  Elements are processed 4 at a time with the aligned SSE load/store intrinsics
		  *  (so m.m_Val must be 16-byte aligned); the leftover elements are done one by one.
		  */
		template <size_t NROWS,size_t NCOLS>
		void sumInPlace_SIMD(
			CMatrixFixedNumeric<float,NROWS,NCOLS>& m,
			const float val)
		{
			const size_t total  = NROWS*NCOLS;
			const size_t packed = (total >> 2) << 2;   // Largest multiple of 4 not above "total"
			const __m128 addend = _mm_load1_ps(&val);  // "val" replicated in the 4 lanes
			float *p = m.m_Val;
			float *const packedEnd = m.m_Val + packed;
			float *const end       = m.m_Val + total;
			for (; p!=packedEnd; p+=4)
				_mm_store_ps(p, _mm_add_ps(_mm_load_ps(p), addend ) );
			// The rest, add as normal:
			for (; p!=end; ++p)
				*p += val;
		}

		/** Adds the scalar "val" to every element of a double matrix.
		  *  Elements are processed 2 at a time with the aligned SSE2 load/store intrinsics
		  *  (so m.m_Val must be 16-byte aligned); a leftover element is done as a plain scalar.
		  */
		template <size_t NROWS,size_t NCOLS>
		void sumInPlace_SIMD(
			CMatrixFixedNumeric<double,NROWS,NCOLS>& m,
			const double val)
		{
			const size_t total  = NROWS*NCOLS;
			const size_t packed = (total >> 1) << 1;    // Largest multiple of 2 not above "total"
			const __m128d addend = _mm_load1_pd(&val);  // "val" replicated in the 2 lanes
			double *p = m.m_Val;
			double *const packedEnd = m.m_Val + packed;
			double *const end       = m.m_Val + total;
			for (; p!=packedEnd; p+=2)
				_mm_store_pd(p, _mm_add_pd(_mm_load_pd(p), addend ) );
			// The rest, add as normal:
			for (; p!=end; ++p)
				*p += val;
		}

		/** Sum two float matrices, element by element: M+=A
		  *  Elements are processed 4 at a time with the aligned SSE load/store intrinsics
		  *  (so both element buffers must be 16-byte aligned); the leftover elements are done one by one.
		  */
		template <size_t NROWS,size_t NCOLS>
		void sumInPlace_SIMD(
			CMatrixFixedNumeric<float,NROWS,NCOLS>& M,
			const CMatrixFixedNumeric<float,NROWS,NCOLS>& A)
		{
			const size_t total  = NROWS*NCOLS;
			const size_t packed = (total >> 2) << 2;   // Largest multiple of 4 not above "total"
			float *dst = M.m_Val;
			const float *src = A.m_Val;
			float *const packedEnd = M.m_Val + packed;
			float *const end       = M.m_Val + total;
			for (; dst!=packedEnd; dst+=4, src+=4)
				_mm_store_ps(dst, _mm_add_ps(_mm_load_ps(dst), _mm_load_ps(src)) );
			// The rest, as normal:
			for (; dst!=end; ++dst, ++src)
				*dst += *src;
		}

		/** Sum two double matrices, element by element: M+=A
		  *  Elements are processed 2 at a time with the aligned SSE2 load/store intrinsics
		  *  (so both element buffers must be 16-byte aligned); a leftover element is done as a plain scalar.
		  */
		template <size_t NROWS,size_t NCOLS>
		void sumInPlace_SIMD(
			CMatrixFixedNumeric<double,NROWS,NCOLS>& M,
			const CMatrixFixedNumeric<double,NROWS,NCOLS>& A)
		{
			const size_t total  = NROWS*NCOLS;
			const size_t packed = (total >> 1) << 1;   // Largest multiple of 2 not above "total"
			double *dst = M.m_Val;
			const double *src = A.m_Val;
			double *const packedEnd = M.m_Val + packed;
			double *const end       = M.m_Val + total;
			for (; dst!=packedEnd; dst+=2, src+=2)
				_mm_store_pd(dst, _mm_add_pd(_mm_load_pd(dst), _mm_load_pd(src)) );
			// The rest, as normal:
			for (; dst!=end; ++dst, ++src)
				*dst += *src;
		}

		/** Subtract two float matrices, element by element: M-=A
		  *  Elements are processed 4 at a time with the aligned SSE load/store intrinsics
		  *  (so both element buffers must be 16-byte aligned); the leftover elements are done one by one.
		  */
		template <size_t NROWS,size_t NCOLS>
		void substractInPlace_SIMD(
			CMatrixFixedNumeric<float,NROWS,NCOLS>& M,
			const CMatrixFixedNumeric<float,NROWS,NCOLS>& A)
		{
			const size_t total  = NROWS*NCOLS;
			const size_t packed = (total >> 2) << 2;   // Largest multiple of 4 not above "total"
			float *dst = M.m_Val;
			const float *src = A.m_Val;
			float *const packedEnd = M.m_Val + packed;
			float *const end       = M.m_Val + total;
			for (; dst!=packedEnd; dst+=4, src+=4)
				_mm_store_ps(dst, _mm_sub_ps(_mm_load_ps(dst), _mm_load_ps(src)) );
			// The rest, as normal:
			for (; dst!=end; ++dst, ++src)
				*dst -= *src;
		}

		/** Subtract two double matrices, element by element: M-=A
		  *  Elements are processed 2 at a time with the aligned SSE2 load/store intrinsics
		  *  (so both element buffers must be 16-byte aligned); a leftover element is done as a plain scalar.
		  */
		template <size_t NROWS,size_t NCOLS>
		void substractInPlace_SIMD(
			CMatrixFixedNumeric<double,NROWS,NCOLS>& M,
			const CMatrixFixedNumeric<double,NROWS,NCOLS>& A)
		{
			const size_t total  = NROWS*NCOLS;
			const size_t packed = (total >> 1) << 1;   // Largest multiple of 2 not above "total"
			double *dst = M.m_Val;
			const double *src = A.m_Val;
			double *const packedEnd = M.m_Val + packed;
			double *const end       = M.m_Val + total;
			for (; dst!=packedEnd; dst+=2, src+=2)
				_mm_store_pd(dst, _mm_sub_pd(_mm_load_pd(dst), _mm_load_pd(src)) );
			// The rest, as normal:
			for (; dst!=end; ++dst, ++src)
				*dst -= *src;
		}

		/** Returns the sum of all the elements of a float matrix.
		  *  Four partial sums are accumulated in the lanes of one SSE register (aligned loads, so
		  *  M.m_Val must be 16-byte aligned); they are then added up, followed by the leftover elements.
		  */
		template <size_t NROWS,size_t NCOLS>
		float sumMatrixAllElements_SIMD( const CMatrixFixedNumeric<float,NROWS,NCOLS>& M )
		{
			const size_t total  = NROWS*NCOLS;
			const size_t packed = (total >> 2) << 2;   // Largest multiple of 4 not above "total"
			const float *p = M.m_Val;
			const float *const packedEnd = M.m_Val + packed;
			const float *const end       = M.m_Val + total;
			__m128  partial = _mm_setzero_ps();
			for (; p!=packedEnd; p+=4)
				partial = _mm_add_ps(partial, _mm_load_ps(p));
			// Horizontal sum of the 4 lanes, then the rest as normal:
			const float *lane = (float*)&partial;
			float ret = lane[0]+lane[1]+lane[2]+lane[3];
			for (; p!=end; ++p)
				ret += *p;
			return ret;
		}

		/** Returns the sum of all the elements of a double matrix.
		  *  Two partial sums are accumulated in the lanes of one SSE2 register (aligned loads, so
		  *  M.m_Val must be 16-byte aligned); they are then added up, followed by a leftover element.
		  */
		template <size_t NROWS,size_t NCOLS>
		double sumMatrixAllElements_SIMD( const CMatrixFixedNumeric<double,NROWS,NCOLS>& M )
		{
			const size_t total  = NROWS*NCOLS;
			const size_t packed = (total >> 1) << 1;   // Largest multiple of 2 not above "total"
			const double *p = M.m_Val;
			const double *const packedEnd = M.m_Val + packed;
			const double *const end       = M.m_Val + total;
			__m128d  partial = _mm_setzero_pd();
			for (; p!=packedEnd; p+=2)
				partial = _mm_add_pd(partial, _mm_load_pd(p));
			// Horizontal sum of the 2 lanes, then the rest as normal:
			const double *lane = (double*)&partial;
			double ret = lane[0]+lane[1];
			for (; p!=end; ++p)
				ret += *p;
			return ret;
		}

		/** The minimum element of a float matrix.
		  *  Four running minima, seeded with std::numeric_limits<float>::max(), are kept in the lanes
		  *  of one SSE register (aligned loads, so M.m_Val must be 16-byte aligned); they are then
		  *  reduced to one value, which is finally compared with the leftover elements.
		  */
		template <size_t NROWS,size_t NCOLS>
		float minimumMatrix_SIMD(const CMatrixFixedNumeric<float,NROWS,NCOLS>& M)
		{
			const size_t total  = NROWS*NCOLS;
			const size_t packed = (total >> 2) << 2;   // Largest multiple of 4 not above "total"
			const float *p = M.m_Val;
			const float *const packedEnd = M.m_Val + packed;
			const float *const end       = M.m_Val + total;
			static const float seed = std::numeric_limits<float>::max();
			__m128  running = _mm_load1_ps(&seed);
			for (; p!=packedEnd; p+=4)
				running = _mm_min_ps(running, _mm_load_ps(p));
			// Reduce the 4 lanes, then the rest as normal:
			const float *lane = (float*)&running;
			float ret = std::min( std::min( lane[0],lane[1]), std::min(lane[2],lane[3]));
			for (; p!=end; ++p)
				ret = std::min( ret, *p );
			return ret;
		}

		/** The minimum element of a double matrix.
		  *  Two running minima, seeded with std::numeric_limits<double>::max(), are kept in the lanes
		  *  of one SSE2 register (aligned loads, so M.m_Val must be 16-byte aligned); they are then
		  *  reduced to one value, which is finally compared with a leftover element.
		  */
		template <size_t NROWS,size_t NCOLS>
		double minimumMatrix_SIMD(const CMatrixFixedNumeric<double,NROWS,NCOLS>& M)
		{
			const size_t total  = NROWS*NCOLS;
			const size_t packed = (total >> 1) << 1;   // Largest multiple of 2 not above "total"
			const double *p = M.m_Val;
			const double *const packedEnd = M.m_Val + packed;
			const double *const end       = M.m_Val + total;
			static const double seed = std::numeric_limits<double>::max();
			__m128d  running = _mm_load1_pd(&seed);
			for (; p!=packedEnd; p+=2)
				running = _mm_min_pd(running, _mm_load_pd(p));
			// Reduce the 2 lanes, then the rest as normal:
			const double *lane = (double*)&running;
			double ret = std::min( lane[0],lane[1]);
			for (; p!=end; ++p)
				ret = std::min( ret, *p );
			return ret;
		}

		/** The maximum element of a float matrix.
		  *  Elements are scanned 4 at a time with aligned SSE loads (M.m_Val must be 16-byte aligned),
		  *  then the 4 lanes are reduced and the leftover elements are compared one by one.
		  *  The running maximum is seeded with -std::numeric_limits<float>::max(), the most negative
		  *  finite float: numeric_limits<float>::min() is the smallest POSITIVE value, which gave a
		  *  wrong (positive) result for matrices whose elements are all negative.
		  */
		template <size_t NROWS,size_t NCOLS>
		float maximumMatrix_SIMD(const CMatrixFixedNumeric<float,NROWS,NCOLS>& M)
		{
			const size_t N = NROWS*NCOLS;
			const size_t nBlocks = NROWS*NCOLS >> 2;  // /=4
			const float *ptr  = M.m_Val;
			static const float cnst_lowest = -std::numeric_limits<float>::max();
			__m128  acum = _mm_load1_ps(&cnst_lowest);
			for (size_t i=0;i<nBlocks;i++)
			{
				acum = _mm_max_ps(acum, _mm_load_ps(ptr));
				ptr+=4;
			}
			// The rest, as normal:
			float ret = std::max( std::max( ((float*)&acum)[0],((float*)&acum)[1]), std::max(((float*)&acum)[2],((float*)&acum)[3]));
			const size_t Nrest = N-(nBlocks<<2);
			for (size_t i=0;i<Nrest;i++)
				ret = std::max( ret, *ptr++ );
			return ret;
		}

		/** The maximum element of a double matrix.
		  *  Elements are scanned 2 at a time with aligned SSE2 loads (M.m_Val must be 16-byte aligned),
		  *  then the 2 lanes are reduced and a leftover element is compared as a plain scalar.
		  *  The running maximum is seeded with -std::numeric_limits<double>::max(), the most negative
		  *  finite double: numeric_limits<double>::min() is the smallest POSITIVE value, which gave a
		  *  wrong (positive) result for matrices whose elements are all negative.
		  */
		template <size_t NROWS,size_t NCOLS>
		double maximumMatrix_SIMD(const CMatrixFixedNumeric<double,NROWS,NCOLS>& M)
		{
			const size_t N = NROWS*NCOLS;
			const size_t nBlocks = NROWS*NCOLS >> 1;  // /=2
			const double *ptr  = M.m_Val;
			static const double cnst_lowest = -std::numeric_limits<double>::max();
			__m128d  acum = _mm_load1_pd(&cnst_lowest);
			for (size_t i=0;i<nBlocks;i++)
			{
				acum = _mm_max_pd(acum, _mm_load_pd(ptr));
				ptr+=2;
			}
			// The rest, as normal:
			double ret = std::max( ((double*)&acum)[0],((double*)&acum)[1]);
			const size_t Nrest = N-(nBlocks<<1);
			for (size_t i=0;i<Nrest;i++)
				ret = std::max( ret, *ptr++ );
			return ret;
		}

#endif  // ----------------------------- end of SSE2 specializations -----------------------------
		/**  @} */


	} // End of namespace
} // End of namespace

#endif
