72 #include <cuda_runtime.h>
88 using namespace lsst::afw::gpu;
89 using namespace lsst::afw::gpu::detail;
91 namespace afwMath = lsst::afw::math;
92 namespace mathDetailGpu = lsst::afw::math::detail::gpu;
102 const int shMemBytesUsed = 200;
111 int bufferSize = (filterW + blockSizeX - 1) * (filterH + blockSizeY - 1) * pixSize;
113 return shMemSize - shMemBytesUsed - bufferSize > 0;
122 int imgBufferSize = (filterW + blockSizeX - 1) * (filterH + blockSizeY - 1) * pixSize;
123 int mskBufferSize = (filterW + blockSizeX - 1) * (filterH + blockSizeY - 1) *
sizeof(
MskPixel);
125 int memRemaining = shMemSize - shMemBytesUsed - imgBufferSize - mskBufferSize ;
127 return memRemaining > 0;
136 const int coeffN = order + 1;
137 const int coeffPadding = coeffN + 1 - (coeffN % 2);
138 int paramN = (order + 1) * (order + 2) / 2;
140 int yCoeffsAll = coeffPadding * blockSizeX *
sizeof(double);
141 int xChebyAll = coeffPadding * blockSizeX *
sizeof(double);
142 int smemParams = paramN *
sizeof(double);
144 int smemKernelSum = kernelN *
sizeof(double);
145 int smemSfnPtr = kernelN *
sizeof(
double*);
147 int memRemainingSfn = shMemSize - shMemBytesUsed - yCoeffsAll - xChebyAll - smemParams ;
148 int memRemainingNorm = shMemSize - shMemBytesUsed - smemKernelSum - smemSfnPtr;
150 return min(memRemainingSfn, memRemainingNorm) > 0;
156 int CalcBlockCount(
int multiprocCount)
158 if (multiprocCount < 12)
return multiprocCount * 4;
159 if (multiprocCount < 24)
return multiprocCount * 2;
160 return multiprocCount;
170 GpuMemOwner<int> resGpu;
173 gpu::CallTestGpuKernel(resGpu.ptr);
175 resGpu.CopyFromGpu(res);
184 template <
typename ResultT,
typename InT>
187 int n = int(images.size());
188 vector<ResultT>
sum(n);
189 for (
int i = 0; i < n; i++) {
190 ResultT totalSum = 0;
191 int h = images[i].height;
192 int w = images[i].width;
194 for (
int y = 0;
y < h;
y++) {
196 for (
int x = 0;
x <
w;
x++) {
197 rowSum += images[i].Pixel(
x,
y);
220 template <
typename OutPixelT,
typename InPixelT>
221 void GPU_ConvolutionImage_LC_Img(
223 const vector<double>& colPos,
224 const vector<double>& rowPos,
225 const std::vector< afwMath::Kernel::SpatialFunctionPtr >& sFn,
226 const vector<double*>& sFnValGPUPtr,
231 int kernelW,
int kernelH,
232 const vector<double>& basisKernelSums,
237 const int kernelN = sFn.size();
240 GpuMemOwner<InPixelT > inImageGPU;
241 inImageGPU.Transfer(inImage);
242 if (inImageGPU.ptr == NULL) {
243 throw LSST_EXCEPT(GpuMemoryError,
"Not enough memory on GPU for input image");
246 GpuMemOwner<OutPixelT> outImageGPU;
247 outImageGPU.Alloc( outImage.
Size());
248 if (outImageGPU.ptr == NULL) {
249 throw LSST_EXCEPT(GpuMemoryError,
"Not enough memory on GPU for output image");
252 GpuMemOwner<double> colPosGPU_Owner;
253 colPosGPU_Owner.TransferVec(colPos);
254 if (colPosGPU_Owner.ptr == NULL) {
256 "Not enough memory on GPU for row coordinate tranformation data");
258 GpuMemOwner<double> rowPosGPU_Owner;
259 rowPosGPU_Owner.TransferVec(rowPos);
260 if (rowPosGPU_Owner.ptr == NULL) {
262 "Not enough memory on GPU for column coordinate tranformation data");
264 vector< double* > sFnParamsGPUPtr(kernelN);
265 vector< GpuMemOwner<double> > sFnParamsGPU_Owner(kernelN);
268 for (
int i = 0; i < kernelN; i++) {
269 std::vector<double> spatialParams = sFn[i]->getParameters();
270 sFnParamsGPUPtr[i] = sFnParamsGPU_Owner[i].TransferVec(spatialParams);
271 if (sFnParamsGPUPtr[i] == NULL) {
272 throw LSST_EXCEPT(GpuMemoryError,
"Not enough memory on GPU for spatial function parameters");
278 for (
int i = 0; i < kernelN; i++)
286 gpu::Call_PolynomialImageValues(
289 sFnParamsGPU_Owner[i].ptr,
296 cudaError_t cuda_err = cudaGetLastError();
297 if (cuda_err != cudaSuccess) {
298 throw LSST_EXCEPT(GpuRuntimeError,
"GPU calculation failed to run");
307 gpu::Call_ChebyshevImageValues(
310 sFnParamsGPU_Owner[i].ptr,
313 xyRange.getMinX(), xyRange.getMinY(), xyRange.getMaxX(), xyRange.getMaxY(),
318 cudaError_t cuda_err = cudaGetLastError();
319 if (cuda_err != cudaSuccess) {
320 throw LSST_EXCEPT(GpuRuntimeError,
"GPU calculation failed to run");
324 cudaThreadSynchronize();
325 cudaError_t cuda_err = cudaGetLastError();
326 if (cuda_err != cudaSuccess) {
327 throw LSST_EXCEPT(GpuRuntimeError,
"GPU calculation failed to run");
334 GpuMemOwner<double> basisKernelSumsGPU;
335 basisKernelSumsGPU.TransferVec(basisKernelSums);
337 bool isDivideByZero =
false;
338 GpuMemOwner<bool> isDivideByZeroGPU;
339 isDivideByZeroGPU.Transfer(&isDivideByZero, 1);
341 gpu::Call_NormalizationImageValues(
344 basisKernelSumsGPU.ptr,
345 isDivideByZeroGPU.ptr,
349 cudaThreadSynchronize();
350 if (cudaGetLastError() != cudaSuccess) {
351 throw LSST_EXCEPT(GpuRuntimeError,
"GPU calculation failed to run");
353 CopyFromGpu<bool>(&isDivideByZero, isDivideByZeroGPU.ptr, 1);
354 if (isDivideByZero) {
355 throw LSST_EXCEPT(pexExcept::OverflowError,
"Cannot normalize; kernel sum is 0");
360 mathDetailGpu::Call_ConvolutionKernel_LC_Img(
362 basisKernelsListGPU, kernelN,
370 cudaThreadSynchronize();
371 if (cudaGetLastError() != cudaSuccess) {
372 throw LSST_EXCEPT(GpuRuntimeError,
"GPU calculation failed to run");
375 CopyFromGpu(outImage.
img, outImageGPU.ptr, outImage.
Size() );
381 template <
typename OutPixelT,
typename InPixelT>
382 void GPU_ConvolutionImage_LinearCombinationKernel(
384 vector<double> colPos,
385 vector<double> rowPos,
386 std::vector< afwMath::Kernel::SpatialFunctionPtr > sFn,
393 assert(basisKernels.size() == sFn.size());
395 int outWidth = outImage.
width;
396 int outHeight = outImage.
height;
398 const int kernelN = sFn.size();
399 const int kernelW = basisKernels[0].width;
400 const int kernelH = basisKernels[0].height;
401 const int kernelSize = kernelW * kernelH;
403 for (
int i = 0; i < kernelN; i++) {
404 assert(kernelW == basisKernels[i].width);
405 assert(kernelH == basisKernels[i].height);
409 GpuMemOwner<KerPixel > basisKernelsGPU;
410 basisKernelsGPU.Alloc(kernelSize * kernelN);
412 for (
int i = 0; i < kernelN; i++) {
413 KerPixel* kernelBeg = basisKernelsGPU.ptr + (kernelSize * i);
421 vector< double* > sFnValGPUPtr(kernelN);
422 vector< GpuMemOwner<double > > sFnValGPU_Owner(kernelN);
424 for (
int i = 0; i < kernelN; i++) {
425 sFnValGPUPtr[i] = sFnValGPU_Owner[i].Alloc(outWidth * outHeight);
426 if (sFnValGPUPtr[i] == NULL) {
427 throw LSST_EXCEPT(GpuMemoryError,
"Not enough memory on GPU for spatial function values");
430 GpuMemOwner<double*> sFnValGPU;
431 sFnValGPU.TransferVec(sFnValGPUPtr);
433 GpuMemOwner<double > normGPU_Owner;
434 vector<double> basisKernelSums(kernelN);
437 normGPU_Owner.Alloc(outWidth * outHeight);
438 if (normGPU_Owner.ptr == NULL) {
439 throw LSST_EXCEPT(GpuMemoryError,
"Not enough memory on GPU for normalization coefficients");
443 basisKernelSums = SumsOfImages<double, KerPixel>(basisKernels);
446 GPU_ConvolutionImage_LC_Img(
463 #define INSTANTIATE_GPU_ConvolutionImage_LinearCombinationKernel(OutPixelT,InPixelT) \
464 template void GPU_ConvolutionImage_LinearCombinationKernel<OutPixelT,InPixelT>( \
465 GpuBuffer2D<InPixelT>& inImage, \
466 vector<double> colPos, \
467 vector<double> rowPos, \
468 std::vector< afwMath::Kernel::SpatialFunctionPtr > sFn, \
469 GpuBuffer2D<OutPixelT>& outImage, \
470 std::vector< GpuBuffer2D<KerPixel> >& basisKernels, \
471 SpatialFunctionType_t sfType, \
475 template <
typename OutPixelT,
typename InPixelT>
476 void GPU_ConvolutionMI_LinearCombinationKernel(
480 vector<double> colPos,
481 vector<double> rowPos,
482 std::vector< afwMath::Kernel::SpatialFunctionPtr > sFn,
491 assert(basisKernels.size() == sFn.size());
492 assert(outImageImg.
width == outImageVar.
width);
493 assert(outImageImg.
width == outImageMsk.
width);
497 int outWidth = outImageImg.
width;
498 int outHeight = outImageImg.
height;
500 const int kernelN = sFn.size();
501 const int kernelW = basisKernels[0].width;
502 const int kernelH = basisKernels[0].height;
503 const int kernelSize = kernelW * kernelH;
505 for (
int i = 0; i < kernelN; i++) {
506 assert(kernelW == basisKernels[i].width);
507 assert(kernelH == basisKernels[i].height);
511 GpuMemOwner<KerPixel > basisKernelsGPU;
512 basisKernelsGPU.Alloc(kernelSize * kernelN);
514 for (
int i = 0; i < kernelN; i++) {
515 KerPixel* kernelBeg = basisKernelsGPU.ptr + (kernelSize * i);
523 vector< double* > sFnValGPUPtr(kernelN);
524 vector< GpuMemOwner<double > > sFnValGPU_Owner(kernelN);
526 for (
int i = 0; i < kernelN; i++) {
527 sFnValGPUPtr[i] = sFnValGPU_Owner[i].Alloc(outWidth * outHeight);
528 if (sFnValGPUPtr[i] == NULL) {
529 throw LSST_EXCEPT(GpuMemoryError,
"Not enough memory on GPU for spatial function values");
532 GpuMemOwner<double*> sFnValGPU;
533 sFnValGPU.TransferVec(sFnValGPUPtr);
536 GpuMemOwner<double > normGPU_Owner;
537 std::vector<KerPixel> basisKernelSums(kernelN);
540 normGPU_Owner.Alloc(outWidth * outHeight);
541 if (normGPU_Owner.ptr == NULL) {
542 throw LSST_EXCEPT(GpuMemoryError,
"Not enough memory on GPU for normalization coefficients");
545 basisKernelSums = SumsOfImages<double, KerPixel>(basisKernels);
548 GPU_ConvolutionImage_LC_Img(
564 GpuMemOwner<VarPixel> inImageGPUVar;
565 inImageGPUVar.Transfer(inImageVar);
566 if (inImageGPUVar.ptr == NULL) {
567 throw LSST_EXCEPT(GpuMemoryError,
"Not enough memory on GPU for input variance");
569 GpuMemOwner<MskPixel> inImageGPUMsk;
570 inImageGPUMsk.Transfer(inImageMsk);
571 if (inImageGPUMsk.ptr == NULL) {
572 throw LSST_EXCEPT(GpuMemoryError,
"Not enough memory on GPU for input mask");
576 GpuMemOwner<VarPixel > outImageGPUVar;
577 outImageGPUVar.Alloc( outImageVar.
Size());
578 if (outImageGPUVar.ptr == NULL) {
579 throw LSST_EXCEPT(GpuMemoryError,
"Not enough memory on GPU for output variance");
581 GpuMemOwner<MskPixel > outImageGPUMsk;
582 outImageGPUMsk.Alloc( outImageMsk.
Size());
583 if (outImageGPUMsk.ptr == NULL) {
584 throw LSST_EXCEPT(GpuMemoryError,
"Not enough memory on GPU for output mask");
590 mathDetailGpu::Call_ConvolutionKernel_LC_Var(
591 inImageGPUVar.ptr, inImageVar.
width, inImageVar.
height,
593 basisKernelsGPU.ptr, kernelN,
602 cudaThreadSynchronize();
603 if (cudaGetLastError() != cudaSuccess)
604 throw LSST_EXCEPT(GpuRuntimeError,
"GPU calculation failed to run");
606 CopyFromGpu(outImageVar.
img, outImageGPUVar.ptr, outImageVar.
Size() );
607 CopyFromGpu(outImageMsk.
img, outImageGPUMsk.ptr, outImageMsk.
Size() );
610 #define INSTANTIATE_GPU_ConvolutionMI_LinearCombinationKernel(OutPixelT,InPixelT) \
611 template void GPU_ConvolutionMI_LinearCombinationKernel<OutPixelT,InPixelT>( \
612 GpuBuffer2D<InPixelT>& inImageImg, \
613 GpuBuffer2D<VarPixel>& inImageVar, \
614 GpuBuffer2D<MskPixel>& inImageMsk, \
615 vector<double> colPos, \
616 vector<double> rowPos, \
617 std::vector< afwMath::Kernel::SpatialFunctionPtr > sFn, \
618 GpuBuffer2D<OutPixelT>& outImageImg, \
619 GpuBuffer2D<VarPixel>& outImageVar, \
620 GpuBuffer2D<MskPixel>& outImageMsk, \
621 std::vector< GpuBuffer2D<KerPixel> >& basisKernels, \
622 SpatialFunctionType_t sfType, \
627 template <
typename OutPixelT,
typename InPixelT>
628 void GPU_ConvolutionImage_SpatiallyInvariantKernel(
634 int kernelW = kernel.
width;
635 int kernelH = kernel.
height;
637 GpuMemOwner<InPixelT> inImageGPU;
638 inImageGPU.Transfer(inImage);
639 if (inImageGPU.ptr == NULL) {
640 throw LSST_EXCEPT(GpuMemoryError,
"Not enough memory on GPU for input image");
645 GpuMemOwner<KerPixel > basisKernelGPU;
646 basisKernelGPU.Transfer(kernel);
647 if (basisKernelGPU.ptr == NULL) {
648 throw LSST_EXCEPT(GpuMemoryError,
"Not enough memory on GPU available for kernel");
651 vector< OutPixelT* > outImageGPUPtr(1);
652 vector< GpuMemOwner<OutPixelT> > outImageGPU_Owner(1);
654 outImageGPUPtr[0] = outImageGPU_Owner[0].Alloc( outImage.
Size());
655 if (outImageGPUPtr[0] == NULL) {
656 throw LSST_EXCEPT(GpuMemoryError,
"Not enough memory on GPU available for output image");
658 GpuMemOwner<OutPixelT*> outImageGPU;
659 outImageGPU.TransferVec(outImageGPUPtr);
664 mathDetailGpu::Call_SpatiallyInvariantImageConvolutionKernel<OutPixelT, InPixelT>(
666 basisKernelGPU.ptr, 1,
672 cudaThreadSynchronize();
673 if (cudaGetLastError() != cudaSuccess) {
674 throw LSST_EXCEPT(GpuRuntimeError,
"GPU calculation failed to run");
676 CopyFromGpu(outImage.
img, outImageGPUPtr[0], outImage.
Size() );
679 #define INSTANTIATE_GPU_ConvolutionImage_SpatiallyInvariantKernel(OutPixelT,InPixelT) \
680 template void GPU_ConvolutionImage_SpatiallyInvariantKernel<OutPixelT,InPixelT>( \
681 GpuBuffer2D<InPixelT>& inImage, \
682 GpuBuffer2D<OutPixelT>& outImage, \
683 GpuBuffer2D<KerPixel>& kernel \
686 template <
typename OutPixelT,
typename InPixelT>
687 void GPU_ConvolutionMI_SpatiallyInvariantKernel(
697 int kernelW = kernel.
width;
698 int kernelH = kernel.
height;
700 GpuMemOwner<InPixelT> inImageGPUImg;
701 inImageGPUImg.Transfer(inImageImg);
702 if (inImageGPUImg.ptr == NULL) {
703 throw LSST_EXCEPT(GpuMemoryError,
"Not enough memory on GPU available for input image");
705 GpuMemOwner<VarPixel> inImageGPUVar;
706 inImageGPUVar.Transfer(inImageVar);
707 if (inImageGPUVar.ptr == NULL) {
708 throw LSST_EXCEPT(GpuMemoryError,
"Not enough memory on GPU available for input variance");
710 GpuMemOwner<MskPixel> inImageGPUMsk;
711 inImageGPUMsk.Transfer(inImageMsk);
712 if (inImageGPUMsk.ptr == NULL) {
713 throw LSST_EXCEPT(GpuMemoryError,
"Not enough memory on GPU available for input mask");
718 GpuMemOwner<KerPixel > basisKernelGPU;
719 basisKernelGPU.Transfer(kernel);
720 if (basisKernelGPU.ptr == NULL)
721 throw LSST_EXCEPT(GpuMemoryError,
"Not enough memory on GPU available for kernel");
724 vector< OutPixelT* > outImageGPUPtrImg(1);
725 vector< VarPixel* > outImageGPUPtrVar(1);
726 vector< MskPixel* > outImageGPUPtrMsk(1);
728 vector< GpuMemOwner<OutPixelT> > outImageGPU_OwnerImg(1);
729 vector< GpuMemOwner<VarPixel > > outImageGPU_OwnerVar(1);
730 vector< GpuMemOwner<MskPixel > > outImageGPU_OwnerMsk(1);
732 outImageGPUPtrImg[0] = outImageGPU_OwnerImg[0].Alloc( outImageImg.
Size());
733 if (outImageGPUPtrImg[0] == NULL) {
734 throw LSST_EXCEPT(GpuMemoryError,
"Not enough memory on GPU available for output image");
736 outImageGPUPtrVar[0] = outImageGPU_OwnerVar[0].Alloc( outImageVar.
Size());
737 if (outImageGPUPtrVar[0] == NULL) {
738 throw LSST_EXCEPT(GpuMemoryError,
"Not enough memory on GPU available for output variance");
740 outImageGPUPtrMsk[0] = outImageGPU_OwnerMsk[0].Alloc( outImageMsk.
Size());
741 if (outImageGPUPtrMsk[0] == NULL) {
742 throw LSST_EXCEPT(GpuMemoryError,
"Not enough memory on GPU available for output mask");
745 GpuMemOwner<OutPixelT*> outImageGPUImg;
746 outImageGPUImg.TransferVec(outImageGPUPtrImg);
747 GpuMemOwner<VarPixel*> outImageGPUVar;
748 outImageGPUVar.TransferVec(outImageGPUPtrVar);
749 GpuMemOwner<MskPixel*> outImageGPUMsk;
750 outImageGPUMsk.TransferVec(outImageGPUPtrMsk);
754 mathDetailGpu::Call_SpatiallyInvariantImageConvolutionKernel<OutPixelT, InPixelT>(
755 inImageGPUImg.ptr, inImageImg.
width, inImageImg.
height,
756 basisKernelGPU.ptr, 1,
763 for (
int y = 0;
y < kernelH;
y++) {
764 for (
int x = 0;
x < kernelW;
x++) {
769 CopyFromGpu(outImageImg.
img, outImageGPUPtrImg[0], outImageImg.
Size() );
771 basisKernelGPU.CopyToGpu(kernel);
775 mathDetailGpu::Call_SpatiallyInvariantImageConvolutionKernel<VarPixel, VarPixel>(
776 inImageGPUVar.ptr, inImageVar.
width, inImageVar.
height,
777 basisKernelGPU.ptr, 1,
784 cudaThreadSynchronize();
785 if (cudaGetLastError() != cudaSuccess) {
786 throw LSST_EXCEPT(GpuRuntimeError,
"GPU variance calculation failed to run");
788 mathDetailGpu::Call_SpatiallyInvariantMaskConvolutionKernel(
789 inImageGPUMsk.ptr, inImageMsk.
width, inImageMsk.
height,
790 basisKernelGPU.ptr, 1,
796 cudaThreadSynchronize();
797 if (cudaGetLastError() != cudaSuccess) {
798 throw LSST_EXCEPT(GpuRuntimeError,
"GPU mask calculation failed to run");
800 CopyFromGpu(outImageVar.
img, outImageGPUPtrVar[0], outImageVar.
Size() );
801 CopyFromGpu(outImageMsk.
img, outImageGPUPtrMsk[0], outImageMsk.
Size() );
804 #define INSTANTIATE_GPU_ConvolutionMI_SpatiallyInvariantKernel(OutPixelT,InPixelT) \
805 template void GPU_ConvolutionMI_SpatiallyInvariantKernel<OutPixelT,InPixelT>( \
806 GpuBuffer2D<InPixelT>& inImageImg, \
807 GpuBuffer2D<VarPixel>& inImageVar, \
808 GpuBuffer2D<MskPixel>& inImageMsk, \
809 GpuBuffer2D<OutPixelT>& outImageImg, \
810 GpuBuffer2D<VarPixel>& outImageVar, \
811 GpuBuffer2D<MskPixel>& outImageMsk, \
812 GpuBuffer2D<KerPixel>& kernel \
820 #define INSTANTIATE(OutPixelT,InPixelT) \
821 INSTANTIATE_GPU_ConvolutionImage_LinearCombinationKernel(OutPixelT,InPixelT) \
822 INSTANTIATE_GPU_ConvolutionMI_LinearCombinationKernel(OutPixelT,InPixelT) \
823 INSTANTIATE_GPU_ConvolutionImage_SpatiallyInvariantKernel(OutPixelT,InPixelT) \
824 INSTANTIATE_GPU_ConvolutionMI_SpatiallyInvariantKernel(OutPixelT,InPixelT)
2-dimensional weighted sum of Chebyshev polynomials of the first kind.
Declare the Kernel class and subclasses.
Class for representing an image or 2D array in general)
bool IsSufficientSharedMemoryAvailable_ForImgBlock(int filterW, int filterH, int pixSize)
int getOrder() const
Get the polynomial order.
2-dimensional polynomial function with cross terms
lsst::afw::geom::Box2D getXYRange() const
Get x,y range.
Define a collection of useful Functions.
contains GpuBuffer2D class (for simple handling of images or 2D arrays)
boost::enable_if< typename ExpressionTraits< Scalar >::IsScalar, Scalar >::type sum(Scalar const &scalar)
table::Key< table::Array< Kernel::Pixel > > image
int GetCudaCurSMSharedMemorySize()
returns shared memory size per block of currently selected cuda device
Set up for convolution, calls GPU convolution kernels.
void TestGpuKernel(int &ret1, int &ret2)
bool IsSufficientSharedMemoryAvailable_ForSfn(int order, int kernelN)
Functions to help managing setup for GPU kernels.
int GetCudaCurSMCount()
returns the number of streaming multiprocessors of currently selected cuda device ...
#define LSST_EXCEPT(type,...)
A floating-point coordinate rectangle geometry.
Functions to query the properties of currently selected GPU device.
PixelT & Pixel(int x, int y)
Functions and a class to help allocating GPU global memory and transferring data to and from a GPU...
Definition of default types for Masks and Variance Images.
lsst::afw::image::MaskPixel MskPixel
bool IsSufficientSharedMemoryAvailable_ForImgAndMaskBlock(int filterW, int filterH, int pixSize)