/*!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
! Copyright 2010.  Los Alamos National Security, LLC. This material was    !
! produced under U.S. Government contract DE-AC52-06NA25396 for Los Alamos !
! National Laboratory (LANL), which is operated by Los Alamos National     !
! Security, LLC for the U.S. Department of Energy. The U.S. Government has !
! rights to use, reproduce, and distribute this software.  NEITHER THE     !
! GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,     !
! EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS         !
! SOFTWARE.  If software is modified to produce derivative works, such     !
! modified software should be clearly marked, so as not to confuse it      !
! with the version available from LANL.                                    !
!                                                                          !
! Additionally, this program is free software; you can redistribute it     !
! and/or modify it under the terms of the GNU General Public License as    !
! published by the Free Software Foundation; version 2.0 of the License.   !
! Accordingly, this program is distributed in the hope that it will be     !
! useful, but WITHOUT ANY WARRANTY; without even the implied warranty of   !
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General !
! Public License for more details.                                         !
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!*/

/*
 * Select the floating-point type REAL from the build-time REALSIZE macro
 * (size in bytes): 4 selects float, 8 selects double.
 *
 * Any other value -- including an undefined REALSIZE, which the
 * preprocessor evaluates as 0 -- previously left REAL undefined and only
 * surfaced later as errors on every declaration that uses REAL.  Fail here
 * with a clear message instead, unless REAL was already supplied
 * externally (e.g. on the compiler command line).
 */
#if REALSIZE==4
  #define REAL float
#elif REALSIZE==8
  #define REAL double
#elif !defined(REAL)
  #error "REALSIZE must be defined as 4 (float) or 8 (double)"
#endif

/*
 * Constants 0 and 1 expressed in the REAL type.
 *
 * The replacement lists are fully parenthesized so each macro expands to a
 * single primary expression.  Without the outer parentheses a use such as
 * `sizeof ONE` expands to `sizeof (REAL)1.0`, which does not parse.
 */
#define ZERO ((REAL)0.0)
#define ONE ((REAL)1.0)

/*
 * Forward declarations of the CUDA kernels (`__global__` functions) that
 * take REAL-typed buffers.  Only the signatures are visible here; every
 * per-kernel note below is inferred from the name and parameter list and
 * must be confirmed against the kernel definition.
 *
 * NOTE(review): the pointer arguments are presumably device pointers, and
 * the trailing num_threads / NumThreads argument presumably bounds the
 * number of threads that do work -- TODO confirm.
 */

/* Presumably a trace-style reduction over A written through `trace`; meaning of "X2" not visible here -- confirm. */
__global__ void MatrixFastTraceX2Kernel(REAL *A, REAL *trace, int M, int num_threads);
/* Presumably a dot product of A and B over M elements, written through `dot` -- confirm. */
__global__ void FastDotProductKernel(REAL *A, REAL *B, REAL *dot, int M, int num_threads);
/* Presumably element-wise subtraction of m-by-n matrices A and B into C; operand order not visible here -- confirm. */
__global__ void SubtractMatrixKernel(REAL *A, int m, int n, REAL *B, REAL *C, int NumThreads);
/* Presumably adds the identity to the M-by-N matrix A in place -- confirm. */
__global__ void AddIdentityKernel(REAL *A, int M, int N, int num_threads);
/* Presumably scales the m-by-n matrix A by Scalar into B -- confirm. */
__global__ void MultiplyScalarMatrixKernel(REAL Scalar, REAL *A, int m, int n, REAL *B, int num_threads);
/* Presumably a scaled vector subtraction over M elements into C; exact formula involving k not visible here -- confirm. */
__global__ void SubtractVectorKernel(REAL k, REAL *A, REAL *B, REAL *C, int M, int num_threads);
/* Presumably one conjugate-gradient iteration step over the listed work buffers, reporting through error2_ptr -- confirm. */
__global__ void CGIterateKernel(int M, int N, REAL *p0, REAL *tmpmat, REAL *r0, REAL *bo, REAL *error2_ptr, int num_threads);
/* Presumably element-wise addition of m-by-n matrices A and B into C -- confirm. */
__global__ void AddMatrixKernel(REAL *A, int m, int n, REAL *B, REAL *C, int num_threads);
/* Presumably a trace reduction over A written through `result` -- confirm. */
__global__ void MatrixFastTraceKernel(REAL *A, REAL *result, int m, int num_threads);
/* Presumably a scaled vector addition over M elements into C; exact formula involving k not visible here -- confirm. */
__global__ void AddVectorKernel(REAL k, REAL *A, REAL *B, REAL *C, int M, int num_threads);
/* Presumably computes the trace of A and writes it through `trace` -- confirm. */
__global__ void MatrixTraceKernel(REAL *A, REAL *trace, int m, int num_threads);
