#ifndef _SS_MATRIX_H_
#define _SS_MATRIX_H_

//
//   File : ss_matrix.h
//   Creation date : 28.03.2006 by Szymon Stefanek
//
//   Copyright (C) 2006 Szymon Stefanek (pragma at kvirc dot net)
//
//   This program is FREE software. You can redistribute it and/or
//   modify it under the terms of the GNU General Public License
//   as published by the Free Software Foundation; either version 2
//   of the License, or (at your opinion) any later version.
//
//   This program is distributed in the HOPE that it will be USEFUL,
//   but WITHOUT ANY WARRANTY; without even the implied warranty of
//   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//   See the GNU General Public License for more details.
//
//   You should have received a copy of the GNU General Public License
//   along with this program. If not, write to the Free Software Foundation,
//   Inc. ,59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
//

#include <stdlib.h> // malloc,free
#include <string.h> // memcpy
#include <stdio.h> // printf


// helper function for hermitian transposition
char matrix_helper_complexConjugate(char i){ return i; };
unsigned char matrix_helper_complexConjugate(unsigned char i){ return i; };
int matrix_helper_complexConjugate(int i){ return i; };
unsigned int matrix_helper_complexConjugate(unsigned int i){ return i; };
short int matrix_helper_complexConjugate(short int i){ return i; };
unsigned short int matrix_helper_complexConjugate(unsigned short int i){ return i; };
long int matrix_helper_complexConjugate(long int i){ return i; };
unsigned long int matrix_helper_complexConjugate(unsigned long int i){ return i; };
long double matrix_helper_complexConjugate(long double i){ return i; };
double matrix_helper_complexConjugate(double i){ return i; };
float matrix_helper_complexConjugate(float i){ return i; };

// helper functions that allow checking for "nearly zero" values
bool matrix_helper_valueNearlyZero(char i){ return i == 0; };
bool matrix_helper_valueNearlyZero(unsigned char i){ return i == 0; };
bool matrix_helper_valueNearlyZero(int i){ return i == 0; };
bool matrix_helper_valueNearlyZero(unsigned int i){ return i == 0; };
bool matrix_helper_valueNearlyZero(short int i){ return i == 0; };
bool matrix_helper_valueNearlyZero(unsigned short int i){ return i == 0; };
bool matrix_helper_valueNearlyZero(long int i){ return i == 0; };
bool matrix_helper_valueNearlyZero(unsigned long int i){ return i == 0; };
bool matrix_helper_valueNearlyZero(double i){ return i > 0.0 ? i < 0.0000001 : i > -0.0000001; };
bool matrix_helper_valueNearlyZero(long double i){ return i > 0.0 ? i < 0.0000001 : i > -0.0000001; };
bool matrix_helper_valueNearlyZero(float i){ return i > 0.0 ? i < 0.0000001 : i > -0.0000001; };


// our nice template matrix
// this is best used with a double, long double or SSComplex data type as T
// but in line of principle can work also with any integer type (you *might* get unexpected results
// with some functions tough: integer inversion is not possible).
template<class T> class SSMatrix
{
protected:
	// internal data class
	template<class Z> class SSMatrixData
	{
	public:
		unsigned int m_uRows;
		unsigned int m_uCols;
		Z * m_pData;
		unsigned int m_uRefs;
	public:
		void reallocate(unsigned int uRows,unsigned int uCols)
		{
			if((m_uRows == uRows) && (m_uCols == uCols))return;
			m_uRows = uRows;
			m_uCols = uCols;
			if((uRows > 0) && (uCols > 0))
			{
				if(m_pData)
					m_pData = (Z *)::realloc(m_pData,uRows * uCols * sizeof(Z));
				else
					m_pData = (Z *)::malloc(uRows * uCols * sizeof(Z));
			} else {
				if(m_pData)::free(m_pData);
				m_pData = 0;
			}
		}
		
		void allocate(unsigned int uRows,unsigned int uCols)
		{
			m_uRows = uRows;
			m_uCols = uCols;
			if((uRows > 0) && (uCols > 0))
				m_pData = (Z *)::malloc(uRows * uCols * sizeof(Z));
			else
				m_pData = 0;
		}
		
		SSMatrixData()
		{
		}
		
		~SSMatrixData()
		{
			if(m_pData)::free(m_pData);
		}
	};
protected:
	SSMatrixData<T> * m_pData;
public:
	unsigned int rows() const
	{
		return m_pData ? m_pData->m_uRows : 0;
	}
	
	unsigned int cols() const
	{
		return m_pData ? m_pData->m_uCols : 0;
	}
	
	unsigned int columns() const
	{
		return m_pData ? m_pData->m_uCols : 0;
	}

	const T * buffer() const
	{
		return m_pData ? m_pData->m_pData : 0;
	}
	
	// be sure to detach() the matrix before writing to it
	T * buffer()
	{
		return m_pData ? m_pData->m_pData : 0;
	}

	const T * row(unsigned int uRow) const
	{
		if(!m_pData)return 0;
		return m_pData->m_pData + (m_pData->m_uCols * uRow);
	}
	
	// results are undefined if the row is out of range!
	// be sure to detach() the matrix before writing to it
	T * row(unsigned int uRow)
	{
		if(!m_pData)return 0;
		return m_pData->m_pData + (m_pData->m_uCols * uRow);
	}

	const T * operator [](unsigned int uRow) const
	{
		if(!m_pData)return 0;
		return m_pData->m_pData + (m_pData->m_uCols * uRow);
	}
	
	// be sure to detach() the matrix before writing to it
	T * operator [](unsigned int uRow)
	{
		if(!m_pData)return 0;
		return m_pData->m_pData + (m_pData->m_uCols * uRow);
	}

	const T & at(unsigned int uRow,unsigned int uCol) const
	{
		return *(m_pData->m_pData + (m_pData->m_uCols * uRow) + uCol);
	}
	
	// be sure to detach() the matrix before writing to it
	T & at(unsigned int uRow,unsigned int uCol)
	{
		return *(m_pData->m_pData + (m_pData->m_uCols * uRow) + uCol);
	}
	
	// be sure to detach() the matrix before writing to it
	T * bufferAt(unsigned int uRow,unsigned int uCol)
	{
		return m_pData->m_pData + (m_pData->m_uCols * uRow) + uCol;
	}
	
	// be sure to detach() the matrix before writing to it
	void setAt(unsigned int uRow,unsigned int uCol,const T &data)
	{
		if(!m_pData)return;
		if(!m_pData->m_pData)return;
		*(m_pData->m_pData + (m_pData->m_uCols * uRow) + uCol) = data;
	}
	
	void resize(unsigned int uRows,unsigned int uCols)
	{
		if(m_pData)
		{
			if(m_pData->m_uRefs == 1)
			{
				m_pData->reallocate(uRows,uCols);
			} else {
				// detach
				m_pData->m_uRefs--;
				m_pData = new SSMatrixData<T>;
				m_pData->allocate(uRows,uCols);
				m_pData->m_uRefs = 1;
			}
		} else {
			m_pData = new SSMatrixData<T>;
			m_pData->allocate(uRows,uCols);
			m_pData->m_uRefs = 1;
		}
	}
	
	void setSize(unsigned int uRows,unsigned int uCols)
	{
		resize(uRows,uCols);
	}

	void clear()
	{
		if(!m_pData)return; // already null
		if(m_pData->m_uRefs == 1)delete m_pData;
		else m_pData->m_uRefs--;
		m_pData = 0;
	}
	
	void setNull()
	{
		clear();
	}

	// returns true if this matrix is null
	bool isNull() const
	{
		return m_pData == 0;
	}

	// returns true if this matrix is empty (either null or zero sized)
	bool isEmpty() const
	{
		if(!m_pData)return true;
		if(!(m_pData->m_pData))return true;
		return false;
	}
	
	bool isSquare() const
	{
		if(!m_pData)return false; // a matter of opinion here
		if(!m_pData->m_pData)return false; // same as above
		return m_pData->m_uRows == m_pData->m_uCols;
	}

	// detaches this matrix from any other reference (copies the data!)
	// you should not need it unless you use threading...
	void detach()
	{
		if(!m_pData)return;
		if(!m_pData->m_pData)return;
		if(m_pData->m_uRefs < 2)return;
		m_pData->m_uRefs--;
		SSMatrixData<T> * d = new SSMatrixData<T>;
		d->allocate(m_pData->m_uRows,m_pData->m_uCols);
		if(m_pData->m_pData && d->m_pData)
			::memcpy(d->m_pData,m_pData->m_pData,m_pData->m_uRows * m_pData->m_uCols * sizeof(T));
		d->m_uRefs = 1;
		m_pData = d;
	}

#define SS_MATRIX_INTERNAL_FAST_DETACH \
	{ \
		m_pData->m_uRefs--; \
		SSMatrixData<T> * d = new SSMatrixData<T>; \
		d->allocate(m_pData->m_uRows,m_pData->m_uCols); \
		d->m_uRefs = 1; \
		m_pData = d; \
	}

	// set this matrix to all zeroes
	// returns false if this matrix is null
	bool setZero()
	{
		if(!m_pData)return false;
		if(!m_pData->m_pData)return false;
		if(m_pData->m_uRefs > 1)SS_MATRIX_INTERNAL_FAST_DETACH
		T * p = m_pData->m_pData;
		T * e = p + (m_pData->m_uRows * m_pData->m_uCols);
		while(p < e)*p++ = (T)0;
		return true;
	}
	
	// returns true if this matrix is composed of all zeroes
	bool isZero() const
	{
		if(!m_pData)return false; // a matter of opinion here
		if(!m_pData->m_pData)return false; // same as above
		T * p = m_pData->m_pData;
		T * e = p + (m_pData->m_uRows * m_pData->m_uCols);
		while(p < e)
		{
			if(*p != (T)0)return false;
			p++;
		}
		return true;
	}
	
	// set this matrix to be an identity matrix
	// or a pseudo-identity if rows != cols
	// returns false if this matrix is null
	bool setIdentity()
	{
		if(!setZero())return false; // setZero() detaches if needed
		T * p = m_pData->m_pData;
		unsigned int dim = m_pData->m_uRows > m_pData->m_uCols ? m_pData->m_uCols : m_pData->m_uRows;
		T * e = p + (dim * dim);
		dim++;
		while(p < e)
		{
			*p = (T)1;
			p += dim;
		}
		return true;
	}

	SSMatrix<T> & operator = (const SSMatrix<T> &src)
	{
		if(m_pData)
		{
			if(m_pData->m_uRefs < 2)
				delete m_pData;
			else
				m_pData->m_uRefs--;
		}
		m_pData = src.m_pData;
		if(m_pData)
			m_pData->m_uRefs++;
		return *this;
	}

	SSMatrix<T> & operator = (const T data[])
	{
		if(!m_pData)return *this;
		if(!m_pData->m_pData)return *this;
		::memcpy(m_pData->m_pData,data,m_pData->m_uRows * m_pData->m_uCols * sizeof(T));
		return *this;
	}

	bool operator == (const SSMatrix<T> &other)
	{
		if(!m_pData || !m_pData->m_pData)
			return (!other.m_pData || !other.m_pData->m_pData);
		if(!other.m_pData || !other.m_pData->m_pData)
			return false;

		T * p1 = m_pData->m_pData;
		T * p2 = other.m_pData->m_pData;
		T * e = p1 + (m_pData->m_uRows * m_pData->m_uCols);
		while(p1 < e)
		{
			if(*p1 != *p2)return false;
			p1++;
			p2++;
		}

		return true;
	}

	bool operator != (const SSMatrix<T> &other)
	{
		if(!m_pData || !m_pData->m_pData)
			return !(!other.m_pData || !other.m_pData->m_pData);
		if(!other.m_pData || !other.m_pData->m_pData)
			return true;

		T * p1 = m_pData->m_pData;
		T * p2 = other.m_pData->m_pData;
		T * e = p1 + (m_pData->m_uRows * m_pData->m_uCols);
		while(p1 < e)
		{
			if(*p1 != *p2)return true;
			p1++;
			p2++;
		}

		return false;
	}


	// returns the transposed matrix of this one
	SSMatrix<T> transposed()
	{
		// | a11 a12 a13 | ---> | a11 a21 |
		// | a21 a22 a23 |      | a12 a22 |
		//                      | a13 a23 |
		if(!m_pData)return SSMatrix<T>();
		if(!m_pData->m_pData)return SSMatrix<T>();
		SSMatrix<T> ret(m_pData->m_uCols,m_pData->m_uRows);
		T * p = m_pData->m_pData;
		T * e = p + (m_pData->m_uRows * m_pData->m_uCols);
		unsigned int uCol = 0;
		while(p < e)
		{
			T * re = p + m_pData->m_uCols;
			T * d = ret.m_pData->m_pData + uCol;
			while(p < re)
			{
				*d = *p;
				p++;
				d += m_pData->m_uCols;
			}
			uCol++;
		}
		return ret;
	}
	
	void transpose()
	{
		*this = transposed();
	}
	
	void setTransposed()
	{
		*this = transposed();
	}

	// results are undefined if the rows are out of range
	void multiplyRow(unsigned int uRow,const T &multiplier)
	{
		if(!m_pData)return;
		if(!m_pData->m_pData)return;
		if(m_pData->m_uRefs > 1)detach();
		T * sd = m_pData->m_pData + (uRow * m_pData->m_uCols);
		T * e = sd + m_pData->m_uCols;
		while(sd < e)
		{
			*sd *= multiplier;
			sd++;
		}
	}

	// results are undefined if the rows are out of range
	void swapRows(unsigned int uSrcRow,unsigned int uDstRow)
	{
		if(!m_pData)return;
		if(!m_pData->m_pData)return;
		if(m_pData->m_uRefs > 1)detach();
		T aux;
		unsigned int c;
		T * sd = m_pData->m_pData + (uSrcRow * m_pData->m_uCols);
		T * dd = m_pData->m_pData + (uDstRow * m_pData->m_uCols);
		T * e = sd + m_pData->m_uCols;
		while(sd < e)
		{
			aux = *sd;
			*sd = *dd;
			*dd = aux;
			sd++;
			dd++;
		}
	}

	// results are undefined if the rows are out of range
	void sumRow(unsigned int uSrcRow,unsigned int uDstRow)
	{
		if(!m_pData)return;
		if(!m_pData->m_pData)return;
		if(m_pData->m_uRefs > 1)detach();
		T * sd = m_pData->m_pData + (uSrcRow * m_pData->m_uCols);
		T * dd = m_pData->m_pData + (uDstRow * m_pData->m_uCols);
		T * e = sd + m_pData->m_uCols;
		while(sd < e)
		{
			*dd += *sd;
			sd++;
			dd++;
		}
	}

	void dump() const
	{
		if(!m_pData)
		{
			printf("Null matrix\n");
			return;
		}
		if(!m_pData->m_pData)
		{
			printf("Null matrix\n");
			return;
		}
		unsigned int c,r;
		
		for(r = 0;r < m_pData->m_uRows;r++)
		{
			for(c = 0;c < m_pData->m_uCols;c++)
			{
				printf("%.8f ",*(m_pData->m_pData + (r * m_pData->m_uRows) + c));
			}
			printf("\n");
		}
	}

	// results are undefined if the rows are out of range
	void sumRowMultiple(unsigned int uSrcRow,unsigned int uDstRow,const T &multiplier)
	{
		if(!m_pData)return;
		if(!m_pData->m_pData)return;
		if(m_pData->m_uRefs > 1)detach();
		//printf("Sum row %d to row %d multiplied by %f\n",uSrcRow,uDstRow,multiplier);
		T * sd = m_pData->m_pData + (uSrcRow * m_pData->m_uCols);
		T * dd = m_pData->m_pData + (uDstRow * m_pData->m_uCols);
		T * e = sd + m_pData->m_uCols;
		while(sd < e)
		{
			*dd += multiplier * *sd;
			sd++;
			dd++;
		}
	}

	// returns a matrix with the specified rows and columns removed
	SSMatrix<T> minorMatrix(unsigned int uRemoveRow,unsigned int uRemoveCol)
	{
		if(!m_pData)return SSMatrix<T>();
		if(!m_pData->m_pData)return SSMatrix<T>();
		if((m_pData->m_uRows < 2) && (m_pData->m_uCols < 2))return SSMatrix<T>();
		if(m_pData->m_uRows <= uRemoveRow)return SSMatrix<T>();
		if(m_pData->m_uCols <= uRemoveCol)return SSMatrix<T>();
		
		SSMatrix<T> ret(m_pData->m_uRows-1,m_pData->m_uCols-1);

		T * d = ret.m_pData->m_pData;
		T * p = m_pData->m_pData;

		unsigned int r;
		unsigned int len;

		if(uRemoveCol == 0)
		{
			p++;
			len = m_pData->m_uCols - 1;
			for(r = 0;r < m_pData->m_uRows;r++)
			{
				if(r != uRemoveRow)
				{
					::memcpy(d,p,len * sizeof(T));
					p += m_pData->m_uCols;
					d += len;
				} else {
					p += m_pData->m_uCols;
				}
			}
		} else if(uRemoveCol == m_pData->m_uCols - 1)
		{
			len = m_pData->m_uCols - 1;
			for(r = 0;r < m_pData->m_uRows;r++)
			{
				if(r != uRemoveRow)
				{
					::memcpy(d,p,len * sizeof(T));
					p += m_pData->m_uCols;
					d += len;
				} else {
					p += m_pData->m_uCols;
				}
			}
		} else {
			len = m_pData->m_uCols - (uRemoveCol + 1);
			for(r = 0;r < m_pData->m_uRows;r++)
			{
				if(r != uRemoveRow)
				{
					::memcpy(d,p,uRemoveCol * sizeof(T));
					p += uRemoveCol + 1;
					d += uRemoveCol;
					::memcpy(d,p,len * sizeof(T));
					p += len;
					d += len;
				} else {
					p += m_pData->m_uCols;
				}
			}
		}
		return ret;
	}

	// this does NOT do row reordering: it just fails if the
	// decomposition cannot be found for exactly *this matrix
	bool luDecomposition(SSMatrix<T> &l,SSMatrix<T> &u)
	{
		if(rows() != cols())return false; // must be square
		l.resize(rows(),cols());
		u.resize(rows(),cols());
		if(!m_pData)return true; // that's it
		if(!m_pData->m_pData)return true; // that's it

		int i,j,k;

		// first row of u is this first row
		// first row of l is 1 0 0 0 0 0 ....

		l.setAt(0,0,(T)1);
		for(j=1;j<m_pData->m_uCols;j++)
			l.setAt(0,j,(T)0);
		for(j=0;j<m_pData->m_uCols;j++)
			u.setAt(0,j,at(0,j));

		for(i=1;i<m_pData->m_uRows;i++)
		{
			for(j=0;j<m_pData->m_uCols;j++)
			{
				T sum = (T)0;
				if(j < i)
				{
					T udiv = u.at(j,j);
					if(matrix_helper_valueNearlyZero(udiv))
					{
						//printf("DECOMPOSITION FAILED BECAUSE OF U(%d,%d) BEING %f\n",j,j,u.at(j,j));
						return false;
					}

					for(k=0;k<j;k++)sum += (T)(l.at(i,k) * u.at(k,j));
					l.setAt(i,j,(1.0 / udiv) * (at(i,j) - sum));
					u.setAt(i,j,(T)0);
				} else {
					for(k=0;k<i;k++)sum += (T)(l.at(i,k) * u.at(k,j));
					u.setAt(i,j,at(i,j) - sum);
					if(i != j)
						l.setAt(i,j,(T)0);
					else
						l.setAt(i,j,(T)1);
				}
			}
		}
		return true;
	}

	T diagonalProduct()
	{
		if(!m_pData)return (T)0;
		if(!m_pData->m_pData)return (T)0;
		T * p = m_pData->m_pData;
		T * e = p + (m_pData->m_uRows * m_pData->m_uCols);
		T prod = (T)1;
		while(p < e)
		{
			prod *= *p;
			p += m_pData->m_uCols + 1;
		}
		return prod;
	}

	// If the matrix is non square, this algorithm always returns 0
	
	T det()
	{
		if(!m_pData)return (T)0;
		if(!m_pData->m_pData)return (T)0;
		if(m_pData->m_uRows != m_pData->m_uCols)return (T)0;
		unsigned int d = m_pData->m_uRows;

		// fast formulas
		T * p = m_pData->m_pData;

		if(d < 1)return (T)0;

		if(d == 1)return p[0];
		

		if(d == 2)return p[0] * p[3] -
		                 p[1] * p[2];

		if(d == 3)return p[0] * p[4] * p[8] +
		                 p[1] * p[5] * p[6] +
		                 p[2] * p[3] * p[7] - 
		                 p[2] * p[4] * p[6] -
		                 p[1] * p[3] * p[8] -
		                 p[0] * p[5] * p[7];

		// first we attemt to compute the determinant by LU decomposition.
		// In fact we need only the U matrix of the decomposition since L has determinant 1.

		SSMatrix<T> l;
		SSMatrix<T> u;
		if(luDecomposition(l,u))
		{
			//printf("DET AT LEVEL %d,%d WOULD BE %f\n",rows(),cols(),u.diagonalProduct());
			return u.diagonalProduct();
		}

		// No way.. try the standard formula...

		// generic n x n matrix determinant
		// Liebnitz formula is
		
		// det(A) = sum_for_j_over_all_permutations (sign_of_permutation) prod_for_i_from_1_to_n (A[i][permutation(j,i)])
		// where permutation(j,i) is a function that returns the i-th element of the j-th permutation of {1,2,3,....n}
		// and sign_of_permutation is -1 if the permutation is odd and 1 otherwise.
		
		// in fact we use a recursive algo that computes the det as
		// a00 * det(minor_0_0) - a01 * det(minor_0_1) + a02 * det(minor_0_2) - a03 * det(minor_0_3)...

		unsigned int c;
		int sign = 1;
		T det = (T)0;
		for(c = 0;c < m_pData->m_uCols;c++)
		{
			SSMatrix<T> sub = minorMatrix(0,c);
			if(sign)
			{
				det += *p * sub.det();
				sign = 0;
			} else {
				det -= *p * sub.det();
				sign = 1;
			}
			p++;
			//printf("sub %d x %d, det=%f sum now %f\n",sub.rows(),sub.cols(),sub.det(),det);
			//sub.dump();
		}

		return det;
	}
	
	T determinant()
	{
		return det();
	}

	// Gauss-Jordan elimination based inverse.
	// This surely isn't the fastest inversion algorithm you can find.. LU factorization
	// cando faster than this....but it's simple, clear and it works.
	// It also uses 3 times the memory used by this matrix (this, a copy to be destroyed and a return matrix)
	SSMatrix<T> inverse()
	{
		
		if(!m_pData)return SSMatrix<T>();
		if(!m_pData->m_pData)return SSMatrix<T>();
		if(m_pData->m_uCols != m_pData->m_uRows)return SSMatrix<T>(); // matrix not square
		unsigned int d = m_pData->m_uRows;
		SSMatrix<T> ret(d,d);
		if(!ret.setIdentity())
			return SSMatrix<T>();

		// we start in this situation

		// | 1 0 0 0 || 3 5 2 8 |
		// | 0 1 0 0 || 1 2 4 2 |
		// | 0 0 1 0 || 5 7 8 5 |
		// | 0 0 0 1 || 2 4 3 1 |

		// and want to get here

		// | 3 4 0 1 || 3 0 0 0 |
		// | 2 3 2 3 || 0 0 3 0 |
		// | 5 6 1 4 || 0 2 0 0 |
		// | 0 2 0 3 || 0 0 0 2 |

		SSMatrix<T> copy(*this);
		copy.detach();

		T * sd = copy.m_pData->m_pData;

		unsigned int c,r,otherr;
		// at column M try to zero out all the elements except one (the pivot row)
		char usedrows[d];
		for(unsigned int i=0;i<d;i++)usedrows[i] = 0;
		for(c = 0;c < d;c++)
		{
			// pick the first non zero element in this column
			T * pElementAtR = sd + c;
			for(r = 0;r < d;r++)
			{
				if(!usedrows[r])
				{
					if(!matrix_helper_valueNearlyZero(*pElementAtR))
					{
						usedrows[r] = 1;
						break;
					}
				}
				pElementAtR += d;
			}
			if(r == d)
			{
				//printf("Matrix is not invertible because of a column of all zeroes\n");
				return SSMatrix<T>(); // column of all zeroes: non invertible
			}
			//printf("At column %d the first non zero element is in row %d and its %f\n",c,r,*pElementAtR);
			// got a non zero element row
			// now sum multiples of this row to the other ones in order to
			// get the first element of the other row to be zero if otherr != c;
			for(otherr = 0;otherr < d;otherr++)
			{
				if(otherr != r)
				{
					//printf("Working at column %d and row %d: value is %f\n",c,otherr,at(otherr,c));
					// in this row we want a zero
					
					// thus if in otherr row we have x
					// and in r we have y
					// we sum y * mult to x
					// in order to get the result 0
					
					// in other words:
					// x = y * -mult
					// or mult = - (x / y)
					// y is non zero for sure
					T x = *(sd + (otherr * d) + c);
					T mult = - (x / *pElementAtR);
					copy.sumRowMultiple(r,otherr,mult);
					ret.sumRowMultiple(r,otherr,mult);
					if(!matrix_helper_valueNearlyZero(*(sd + (otherr * d) + c)))
					{
						//printf("Matrix is either not invertible, close to singular or there are too big computing errors\n");
						//printf("Divided %f by %f and obtained multiplier %f\n",
						//	x,*pElementAtR,mult);
						//printf("By summing %f * %f to %f should get 0 but got %.20f\n",
						//	*pElementAtR,mult,x,(*(sd + (otherr * d) + c)));
						return SSMatrix<T>(); // either bad data types or matrix really close to singular
					}
				} // else this is our pivot row
			}
		}

		// now we have something like

		// | 3 4 0 1 || 3 0 0 0 |
		// | 2 3 2 3 || 0 0 3 0 |
		// | 5 6 1 4 || 0 2 0 0 |
		// | 0 2 0 3 || 0 0 0 2 |

		// need to swap the rows in order to get to

		// | 3 4 0 1 || 3 0 0 0 |
		// | 5 6 1 4 || 0 2 0 0 |
		// | 2 3 2 3 || 0 0 3 0 |
		// | 0 2 0 3 || 0 0 0 2 |
		
		for(c = 0;c < d;c++)
		{
			// find the row that has a nonzero element in this column
			// and swap it with row c (unless already c == r obviously)
			T * pElementAtR = sd + c;
			for(r = 0;r < d;r++)
			{
				if(!matrix_helper_valueNearlyZero(*pElementAtR))
					break;
				pElementAtR += d;
			}
			if(r == d)
			{
				//printf("Matrix is not invertible because of an internal error\n");
				return SSMatrix<T>(); // should never happen
			}
			if(r != c)
			{
				copy.swapRows(r,c);
				ret.swapRows(r,c);
			}
		}

		// now we have something like

		// | 3 4 0 1 || 3 0 0 0 |
		// | 5 6 1 4 || 0 2 0 0 |
		// | 2 3 2 3 || 0 0 3 0 |
		// | 0 2 0 3 || 0 0 0 2 |

		// now just divide each row in order to get to

		// | 3 4 0 1 || 1 0 0 0 |
		// | 5 6 1 4 || 0 1 0 0 |
		// | 2 3 2 3 || 0 0 1 0 |
		// | 0 2 0 3 || 0 0 0 1 |

		unsigned int step = d + 1;

		for(c = 0;c < d;c++)
		{
			if(matrix_helper_valueNearlyZero(*sd))
			{
				//printf("Matrix is either not invertible, close to singular or there are too big computing errors\n");
				return SSMatrix<T>(); // either bad data types or matrix really close to singular
			}
			ret.multiplyRow(c,((T)1) / *sd); // we should check for nearly-zero division here...
			sd += step;
		}

		return ret;
	}

	void invert()
	{
		*this = inverse();
	}
	
	void setInverse()
	{
		*this = inverse();
	}

	// allocates a null matrix
	SSMatrix()
	: m_pData(0)
	{
	}
	
	// allocates an uninitialized matrix with the specified dimensions
	SSMatrix(unsigned int uRows,unsigned int uCols)
	{
		m_pData = new SSMatrixData<T>;
		m_pData->allocate(uRows,uCols);
		m_pData->m_uRefs = 1;
	}
	
	// allocates a shared copy of the specified matrix
	SSMatrix(const SSMatrix<T> &src)
	{
		m_pData = src.m_pData;
		if(m_pData)m_pData->m_uRefs++;
	}
	
	~SSMatrix()
	{
		if(m_pData)
		{
			if(m_pData->m_uRefs == 1)delete m_pData;
			else m_pData->m_uRefs--;
		}
	}

};


template<class T> SSMatrix<T> operator * (const SSMatrix<T> &a,const SSMatrix<T> &b)
{
	if(a.isNull())return SSMatrix<T>();
	if(b.isNull())return SSMatrix<T>();
	// | a11 a12 a13 a14 || b11 b12 b13 |   | c11 c12 c13 |
	// | a21 a22 a23 a24 || b21 b22 b23 |   | c21 c22 c23 |
	//                    | b31 b32 b33 | =
	//                    | b41 b42 b43 |  
	if(a.cols() != b.rows())return SSMatrix<T>();

	const T * pa = a.buffer();
	const T * pae = pa + a.rows() * a.cols();

	unsigned int ac = a.cols();
	unsigned int bc = b.cols();

	SSMatrix<T> ret(a.rows(),b.cols());
	
	T * d = ret.buffer();
	unsigned int c = 0;
	while(pa < pae)
	{
		const T * pb = b.buffer();
		const T * pbe = pb + bc;
		while(pb < pbe)
		{
			const T * ptra = pa;
			const T * ptrb = pb;
			const T * ptrae = pa + ac;
			T sum = (T)0;
			while(ptra < ptrae)
			{
				sum += *ptra * *ptrb;
				ptra++;
				ptrb += bc;
			}
			*d = sum;
			d++;
			pb++;
		}
		pa += ac;
	}

	return ret;
}


#endif //!_SS_MATRIX_H_

