#include <cuda_runtime.h>

using namespace std;
using namespace lsst::afw::gpu;
using namespace lsst::afw::gpu::detail;

namespace afwMath = lsst::afw::math;
namespace mathDetailGpu = lsst::afw::math::detail::gpu;

const int shMemBytesUsed = 200;
bool IsSufficientSharedMemoryAvailable_ForImgBlock(int filterW, int filterH, int pixSize)
{
    int shMemSize = GetCudaCurSMSharedMemorySize();
    int bufferSize = (filterW + blockSizeX - 1) * (filterH + blockSizeY - 1) * pixSize;

    return shMemSize - shMemBytesUsed - bufferSize > 0;
}
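// Usage sketch (hypothetical caller, not part of this file): the check above
// lets a launcher fall back to a CPU code path whenever the tile buffer of
// (filterW + blockSizeX - 1) x (filterH + blockSizeY - 1) pixels would not
// fit in shared memory. 'convolveOnCpu' is an illustrative placeholder.
//
//     if (!IsSufficientSharedMemoryAvailable_ForImgBlock(kernelW, kernelH, sizeof(double))) {
//         convolveOnCpu(inImage, outImage, kernel);   // hypothetical fallback
//     }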
bool IsSufficientSharedMemoryAvailable_ForImgAndMaskBlock(int filterW, int filterH, int pixSize)
{
    int shMemSize = GetCudaCurSMSharedMemorySize();
    int imgBufferSize = (filterW + blockSizeX - 1) * (filterH + blockSizeY - 1) * pixSize;
    int mskBufferSize = (filterW + blockSizeX - 1) * (filterH + blockSizeY - 1) * sizeof(MskPixel);

    int memRemaining = shMemSize - shMemBytesUsed - imgBufferSize - mskBufferSize;

    return memRemaining > 0;
}
bool IsSufficientSharedMemoryAvailable_ForSfn(int order, int kernelN)
{
    int shMemSize = GetCudaCurSMSharedMemorySize();

    const int coeffN = order + 1;
    const int coeffPadding = coeffN + 1 - (coeffN % 2);
    int paramN = (order + 1) * (order + 2) / 2;

    int yCoeffsAll = coeffPadding * blockSizeX * sizeof(double);
    int xChebyAll = coeffPadding * blockSizeX * sizeof(double);
    int smemParams = paramN * sizeof(double);

    int smemKernelSum = kernelN * sizeof(double);
    int smemSfnPtr = kernelN * sizeof(double*);

    int memRemainingSfn = shMemSize - shMemBytesUsed - yCoeffsAll - xChebyAll - smemParams;
    int memRemainingNorm = shMemSize - shMemBytesUsed - smemKernelSum - smemSfnPtr;

    return min(memRemainingSfn, memRemainingNorm) > 0;
}
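// coeffPadding rounds coeffN up to the nearest odd value, which is likely
// intended to stagger per-thread coefficient rows across shared-memory banks.
// A quick compile-time check of the arithmetic (no runtime cost):
//
//     static_assert(4 + 1 - (4 % 2) == 5, "even coeffN is padded up to odd");
//     static_assert(5 + 1 - (5 % 2) == 5, "odd coeffN is left unchanged");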
int CalcBlockCount(int multiprocCount)
{
    if (multiprocCount < 12) return multiprocCount * 4;
    if (multiprocCount < 24) return multiprocCount * 2;
    return multiprocCount;
}
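// Likely call site (a sketch; GetCudaCurSMCount() is one of the device-query
// helpers used by this file): oversubscribe small devices with several blocks
// per multiprocessor, but only one block per SM on large ones.
//
//     int blockN = CalcBlockCount(GetCudaCurSMCount());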
void TestGpuKernel(int& ret1, int& ret2)
{
    int res[2];

    GpuMemOwner<int> resGpu;
    resGpu.Alloc(2);

    gpu::CallTestGpuKernel(resGpu.ptr);

    resGpu.CopyFromGpu(res);

    ret1 = res[0];
    ret2 = res[1];
}
template <typename ResultT, typename InT>
vector<ResultT> SumsOfImages(const vector< GpuBuffer2D<InT> >& images)
{
    int n = int(images.size());
    vector<ResultT> sum(n);
    for (int i = 0; i < n; i++) {
        ResultT totalSum = 0;
        int h = images[i].height;
        int w = images[i].width;

        for (int y = 0; y < h; y++) {
            ResultT rowSum = 0;
            for (int x = 0; x < w; x++) {
                rowSum += images[i].Pixel(x, y);
            }
            totalSum += rowSum;
        }
        sum[i] = totalSum;
    }
    return sum;
}
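// The row-wise accumulation above is the standard two-level summation: each
// row is summed into rowSum before being added to totalSum, which keeps the
// partial sums small and loses less precision than one running total when
// ResultT is a floating-point type. For example, accumulating 1e8 pixels of
// value 1.0f into a single float stalls at 2^24 = 16777216, while row-level
// partial sums of ~1e4 each remain exact.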
template <typename OutPixelT, typename InPixelT>
void GPU_ConvolutionImage_LC_Img(
    const vector<double>& colPos,
    const vector<double>& rowPos,
    const std::vector< afwMath::Kernel::SpatialFunctionPtr >& sFn,
    const vector<double*>& sFnValGPUPtr,
    int kernelW, int kernelH,
    const vector<double>& basisKernelSums,

    const int kernelN = sFn.size();
    GpuMemOwner<InPixelT> inImageGPU;
    inImageGPU.Transfer(inImage);
    if (inImageGPU.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for input image");
    }

    GpuMemOwner<OutPixelT> outImageGPU;
    outImageGPU.Alloc(outImage.Size());
    if (outImageGPU.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for output image");
    }

    GpuMemOwner<double> colPosGPU_Owner;
    colPosGPU_Owner.TransferVec(colPos);
    if (colPosGPU_Owner.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError,
                          "Not enough memory on GPU for column coordinate transformation data");
    }

    GpuMemOwner<double> rowPosGPU_Owner;
    rowPosGPU_Owner.TransferVec(rowPos);
    if (rowPosGPU_Owner.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError,
                          "Not enough memory on GPU for row coordinate transformation data");
    }
    vector< double* > sFnParamsGPUPtr(kernelN);
    vector< GpuMemOwner<double> > sFnParamsGPU_Owner(kernelN);

    for (int i = 0; i < kernelN; i++) {
        std::vector<double> spatialParams = sFn[i]->getParameters();
        sFnParamsGPUPtr[i] = sFnParamsGPU_Owner[i].TransferVec(spatialParams);
        if (sFnParamsGPUPtr[i] == NULL) {
            throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for spatial function parameters");
        }
    }

    for (int i = 0; i < kernelN; i++) {
        gpu::Call_PolynomialImageValues(
            sFnParamsGPU_Owner[i].ptr,

        cudaError_t cuda_err = cudaGetLastError();
        if (cuda_err != cudaSuccess) {
            throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");
        }

        gpu::Call_ChebyshevImageValues(
            sFnParamsGPU_Owner[i].ptr,
            xyRange.getMinX(), xyRange.getMinY(), xyRange.getMaxX(), xyRange.getMaxY(),

        cudaError_t cuda_err = cudaGetLastError();
        if (cuda_err != cudaSuccess) {
            throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");
        }
    }

    cudaThreadSynchronize();
    cudaError_t cuda_err = cudaGetLastError();
    if (cuda_err != cudaSuccess) {
        throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");
    }
    GpuMemOwner<double> basisKernelSumsGPU;
    basisKernelSumsGPU.TransferVec(basisKernelSums);

    bool isDivideByZero = false;
    GpuMemOwner<bool> isDivideByZeroGPU;
    isDivideByZeroGPU.Transfer(&isDivideByZero, 1);

    gpu::Call_NormalizationImageValues(
        basisKernelSumsGPU.ptr,
        isDivideByZeroGPU.ptr,

    cudaThreadSynchronize();
    if (cudaGetLastError() != cudaSuccess) {
        throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");
    }
    CopyFromGpu<bool>(&isDivideByZero, isDivideByZeroGPU.ptr, 1);
    if (isDivideByZero) {
        throw LSST_EXCEPT(pexExcept::OverflowError, "Cannot normalize; kernel sum is 0");
    }
    mathDetailGpu::Call_ConvolutionKernel_LC_Img(
        basisKernelsListGPU, kernelN,

    cudaThreadSynchronize();
    if (cudaGetLastError() != cudaSuccess) {
        throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");
    }

    CopyFromGpu(outImage.img, outImageGPU.ptr, outImage.Size());
}
template <typename OutPixelT, typename InPixelT>
void GPU_ConvolutionImage_LinearCombinationKernel(
    GpuBuffer2D<InPixelT>& inImage,
    vector<double> colPos,
    vector<double> rowPos,
    std::vector< afwMath::Kernel::SpatialFunctionPtr > sFn,
    GpuBuffer2D<OutPixelT>& outImage,
    std::vector< GpuBuffer2D<KerPixel> >& basisKernels,
    SpatialFunctionType_t sfType,

    assert(basisKernels.size() == sFn.size());

    int outWidth = outImage.width;
    int outHeight = outImage.height;

    const int kernelN = sFn.size();
    const int kernelW = basisKernels[0].width;
    const int kernelH = basisKernels[0].height;
    const int kernelSize = kernelW * kernelH;

    for (int i = 0; i < kernelN; i++) {
        assert(kernelW == basisKernels[i].width);
        assert(kernelH == basisKernels[i].height);
    }

    GpuMemOwner<KerPixel> basisKernelsGPU;
    basisKernelsGPU.Alloc(kernelSize * kernelN);

    for (int i = 0; i < kernelN; i++) {
        KerPixel* kernelBeg = basisKernelsGPU.ptr + (kernelSize * i);
    vector< double* > sFnValGPUPtr(kernelN);
    vector< GpuMemOwner<double> > sFnValGPU_Owner(kernelN);

    for (int i = 0; i < kernelN; i++) {
        sFnValGPUPtr[i] = sFnValGPU_Owner[i].Alloc(outWidth * outHeight);
        if (sFnValGPUPtr[i] == NULL) {
            throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for spatial function values");
        }
    }

    GpuMemOwner<double*> sFnValGPU;
    sFnValGPU.TransferVec(sFnValGPUPtr);
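// Double indirection: each sFnValGPU_Owner[i] owns one device array of
// per-pixel spatial-function values, and sFnValGPU transfers the host vector
// of those device pointers so a kernel can index them as double** on the GPU.
// Layout sketch (illustrative):
//
//     sFnValGPU.ptr            -> [ d_ptr0, d_ptr1, ... ]      (device pointers)
//     d_ptr0 = sFnValGPUPtr[0] -> [ outWidth * outHeight doubles ]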
    GpuMemOwner<double> normGPU_Owner;
    vector<double> basisKernelSums(kernelN);

    normGPU_Owner.Alloc(outWidth * outHeight);
    if (normGPU_Owner.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for normalization coefficients");
    }

    basisKernelSums = SumsOfImages<double, KerPixel>(basisKernels);

    GPU_ConvolutionImage_LC_Img(
#define INSTANTIATE_GPU_ConvolutionImage_LinearCombinationKernel(OutPixelT,InPixelT) \
    template void GPU_ConvolutionImage_LinearCombinationKernel<OutPixelT,InPixelT>( \
        GpuBuffer2D<InPixelT>& inImage, \
        vector<double> colPos, \
        vector<double> rowPos, \
        std::vector< afwMath::Kernel::SpatialFunctionPtr > sFn, \
        GpuBuffer2D<OutPixelT>& outImage, \
        std::vector< GpuBuffer2D<KerPixel> >& basisKernels, \
        SpatialFunctionType_t sfType, \
template <typename OutPixelT, typename InPixelT>
void GPU_ConvolutionMI_LinearCombinationKernel(
    GpuBuffer2D<InPixelT>& inImageImg,
    GpuBuffer2D<VarPixel>& inImageVar,
    GpuBuffer2D<MskPixel>& inImageMsk,
    vector<double> colPos,
    vector<double> rowPos,
    std::vector< afwMath::Kernel::SpatialFunctionPtr > sFn,
    GpuBuffer2D<OutPixelT>& outImageImg,
    GpuBuffer2D<VarPixel>& outImageVar,
    GpuBuffer2D<MskPixel>& outImageMsk,
    std::vector< GpuBuffer2D<KerPixel> >& basisKernels,
    SpatialFunctionType_t sfType,

    assert(basisKernels.size() == sFn.size());
    assert(outImageImg.width == outImageVar.width);
    assert(outImageImg.width == outImageMsk.width);

    int outWidth = outImageImg.width;
    int outHeight = outImageImg.height;

    const int kernelN = sFn.size();
    const int kernelW = basisKernels[0].width;
    const int kernelH = basisKernels[0].height;
    const int kernelSize = kernelW * kernelH;

    for (int i = 0; i < kernelN; i++) {
        assert(kernelW == basisKernels[i].width);
        assert(kernelH == basisKernels[i].height);
    }

    GpuMemOwner<KerPixel> basisKernelsGPU;
    basisKernelsGPU.Alloc(kernelSize * kernelN);

    for (int i = 0; i < kernelN; i++) {
        KerPixel* kernelBeg = basisKernelsGPU.ptr + (kernelSize * i);
    vector< double* > sFnValGPUPtr(kernelN);
    vector< GpuMemOwner<double> > sFnValGPU_Owner(kernelN);

    for (int i = 0; i < kernelN; i++) {
        sFnValGPUPtr[i] = sFnValGPU_Owner[i].Alloc(outWidth * outHeight);
        if (sFnValGPUPtr[i] == NULL) {
            throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for spatial function values");
        }
    }

    GpuMemOwner<double*> sFnValGPU;
    sFnValGPU.TransferVec(sFnValGPUPtr);
    GpuMemOwner<double> normGPU_Owner;
    std::vector<double> basisKernelSums(kernelN);

    normGPU_Owner.Alloc(outWidth * outHeight);
    if (normGPU_Owner.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for normalization coefficients");
    }

    basisKernelSums = SumsOfImages<double, KerPixel>(basisKernels);

    GPU_ConvolutionImage_LC_Img(
    GpuMemOwner<VarPixel> inImageGPUVar;
    inImageGPUVar.Transfer(inImageVar);
    if (inImageGPUVar.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for input variance");
    }
    GpuMemOwner<MskPixel> inImageGPUMsk;
    inImageGPUMsk.Transfer(inImageMsk);
    if (inImageGPUMsk.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for input mask");
    }

    GpuMemOwner<VarPixel> outImageGPUVar;
    outImageGPUVar.Alloc(outImageVar.Size());
    if (outImageGPUVar.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for output variance");
    }
    GpuMemOwner<MskPixel> outImageGPUMsk;
    outImageGPUMsk.Alloc(outImageMsk.Size());
    if (outImageGPUMsk.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for output mask");
    }

    mathDetailGpu::Call_ConvolutionKernel_LC_Var(
        inImageGPUVar.ptr, inImageVar.width, inImageVar.height,
        basisKernelsGPU.ptr, kernelN,

    cudaThreadSynchronize();
    if (cudaGetLastError() != cudaSuccess) {
        throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");
    }

    CopyFromGpu(outImageVar.img, outImageGPUVar.ptr, outImageVar.Size());
    CopyFromGpu(outImageMsk.img, outImageGPUMsk.ptr, outImageMsk.Size());
}
#define INSTANTIATE_GPU_ConvolutionMI_LinearCombinationKernel(OutPixelT,InPixelT) \
    template void GPU_ConvolutionMI_LinearCombinationKernel<OutPixelT,InPixelT>( \
        GpuBuffer2D<InPixelT>& inImageImg, \
        GpuBuffer2D<VarPixel>& inImageVar, \
        GpuBuffer2D<MskPixel>& inImageMsk, \
        vector<double> colPos, \
        vector<double> rowPos, \
        std::vector< afwMath::Kernel::SpatialFunctionPtr > sFn, \
        GpuBuffer2D<OutPixelT>& outImageImg, \
        GpuBuffer2D<VarPixel>& outImageVar, \
        GpuBuffer2D<MskPixel>& outImageMsk, \
        std::vector< GpuBuffer2D<KerPixel> >& basisKernels, \
        SpatialFunctionType_t sfType, \
template <typename OutPixelT, typename InPixelT>
void GPU_ConvolutionImage_SpatiallyInvariantKernel(
    GpuBuffer2D<InPixelT>& inImage,
    GpuBuffer2D<OutPixelT>& outImage,
    GpuBuffer2D<KerPixel>& kernel
)
{
    int kernelW = kernel.width;
    int kernelH = kernel.height;

    GpuMemOwner<InPixelT> inImageGPU;
    inImageGPU.Transfer(inImage);
    if (inImageGPU.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for input image");
    }

    GpuMemOwner<KerPixel> basisKernelGPU;
    basisKernelGPU.Transfer(kernel);
    if (basisKernelGPU.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for kernel");
    }

    vector< OutPixelT* > outImageGPUPtr(1);
    vector< GpuMemOwner<OutPixelT> > outImageGPU_Owner(1);

    outImageGPUPtr[0] = outImageGPU_Owner[0].Alloc(outImage.Size());
    if (outImageGPUPtr[0] == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for output image");
    }
    GpuMemOwner<OutPixelT*> outImageGPU;
    outImageGPU.TransferVec(outImageGPUPtr);

    mathDetailGpu::Call_SpatiallyInvariantImageConvolutionKernel<OutPixelT, InPixelT>(
        basisKernelGPU.ptr, 1,

    cudaThreadSynchronize();
    if (cudaGetLastError() != cudaSuccess) {
        throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");
    }
    CopyFromGpu(outImage.img, outImageGPUPtr[0], outImage.Size());
}
#define INSTANTIATE_GPU_ConvolutionImage_SpatiallyInvariantKernel(OutPixelT,InPixelT) \
    template void GPU_ConvolutionImage_SpatiallyInvariantKernel<OutPixelT,InPixelT>( \
        GpuBuffer2D<InPixelT>& inImage, \
        GpuBuffer2D<OutPixelT>& outImage, \
        GpuBuffer2D<KerPixel>& kernel \
    );
template <typename OutPixelT, typename InPixelT>
void GPU_ConvolutionMI_SpatiallyInvariantKernel(
    GpuBuffer2D<InPixelT>& inImageImg,
    GpuBuffer2D<VarPixel>& inImageVar,
    GpuBuffer2D<MskPixel>& inImageMsk,
    GpuBuffer2D<OutPixelT>& outImageImg,
    GpuBuffer2D<VarPixel>& outImageVar,
    GpuBuffer2D<MskPixel>& outImageMsk,
    GpuBuffer2D<KerPixel>& kernel
)
{
    int kernelW = kernel.width;
    int kernelH = kernel.height;

    GpuMemOwner<InPixelT> inImageGPUImg;
    inImageGPUImg.Transfer(inImageImg);
    if (inImageGPUImg.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for input image");
    }
    GpuMemOwner<VarPixel> inImageGPUVar;
    inImageGPUVar.Transfer(inImageVar);
    if (inImageGPUVar.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for input variance");
    }
    GpuMemOwner<MskPixel> inImageGPUMsk;
    inImageGPUMsk.Transfer(inImageMsk);
    if (inImageGPUMsk.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for input mask");
    }

    GpuMemOwner<KerPixel> basisKernelGPU;
    basisKernelGPU.Transfer(kernel);
    if (basisKernelGPU.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for kernel");
    }
    vector< OutPixelT* > outImageGPUPtrImg(1);
    vector< VarPixel* > outImageGPUPtrVar(1);
    vector< MskPixel* > outImageGPUPtrMsk(1);

    vector< GpuMemOwner<OutPixelT> > outImageGPU_OwnerImg(1);
    vector< GpuMemOwner<VarPixel> > outImageGPU_OwnerVar(1);
    vector< GpuMemOwner<MskPixel> > outImageGPU_OwnerMsk(1);

    outImageGPUPtrImg[0] = outImageGPU_OwnerImg[0].Alloc(outImageImg.Size());
    if (outImageGPUPtrImg[0] == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for output image");
    }
    outImageGPUPtrVar[0] = outImageGPU_OwnerVar[0].Alloc(outImageVar.Size());
    if (outImageGPUPtrVar[0] == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for output variance");
    }
    outImageGPUPtrMsk[0] = outImageGPU_OwnerMsk[0].Alloc(outImageMsk.Size());
    if (outImageGPUPtrMsk[0] == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for output mask");
    }

    GpuMemOwner<OutPixelT*> outImageGPUImg;
    outImageGPUImg.TransferVec(outImageGPUPtrImg);
    GpuMemOwner<VarPixel*> outImageGPUVar;
    outImageGPUVar.TransferVec(outImageGPUPtrVar);
    GpuMemOwner<MskPixel*> outImageGPUMsk;
    outImageGPUMsk.TransferVec(outImageGPUPtrMsk);
    mathDetailGpu::Call_SpatiallyInvariantImageConvolutionKernel<OutPixelT, InPixelT>(
        inImageGPUImg.ptr, inImageImg.width, inImageImg.height,
        basisKernelGPU.ptr, 1,

    // square the kernel on the host; the variance plane is convolved with
    // the squared kernel coefficients
    for (int y = 0; y < kernelH; y++) {
        for (int x = 0; x < kernelW; x++) {
            kernel.Pixel(x, y) *= kernel.Pixel(x, y);
        }
    }

    CopyFromGpu(outImageImg.img, outImageGPUPtrImg[0], outImageImg.Size());

    basisKernelGPU.CopyToGpu(kernel);
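// Squaring the kernel before re-uploading it follows the usual variance
// propagation for a weighted sum of independent pixels:
//
//     Var( sum_i k_i * X_i ) = sum_i k_i^2 * Var(X_i)
//
// so the variance plane below is convolved with the squared coefficients
// while the image plane above used the original kernel.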
    mathDetailGpu::Call_SpatiallyInvariantImageConvolutionKernel<VarPixel, VarPixel>(
        inImageGPUVar.ptr, inImageVar.width, inImageVar.height,
        basisKernelGPU.ptr, 1,

    cudaThreadSynchronize();
    if (cudaGetLastError() != cudaSuccess) {
        throw LSST_EXCEPT(GpuRuntimeError, "GPU variance calculation failed to run");
    }

    mathDetailGpu::Call_SpatiallyInvariantMaskConvolutionKernel(
        inImageGPUMsk.ptr, inImageMsk.width, inImageMsk.height,
        basisKernelGPU.ptr, 1,

    cudaThreadSynchronize();
    if (cudaGetLastError() != cudaSuccess) {
        throw LSST_EXCEPT(GpuRuntimeError, "GPU mask calculation failed to run");
    }

    CopyFromGpu(outImageVar.img, outImageGPUPtrVar[0], outImageVar.Size());
    CopyFromGpu(outImageMsk.img, outImageGPUPtrMsk[0], outImageMsk.Size());
}
#define INSTANTIATE_GPU_ConvolutionMI_SpatiallyInvariantKernel(OutPixelT,InPixelT) \
    template void GPU_ConvolutionMI_SpatiallyInvariantKernel<OutPixelT,InPixelT>( \
        GpuBuffer2D<InPixelT>& inImageImg, \
        GpuBuffer2D<VarPixel>& inImageVar, \
        GpuBuffer2D<MskPixel>& inImageMsk, \
        GpuBuffer2D<OutPixelT>& outImageImg, \
        GpuBuffer2D<VarPixel>& outImageVar, \
        GpuBuffer2D<MskPixel>& outImageMsk, \
        GpuBuffer2D<KerPixel>& kernel \
    );
#define INSTANTIATE(OutPixelT,InPixelT) \
    INSTANTIATE_GPU_ConvolutionImage_LinearCombinationKernel(OutPixelT,InPixelT) \
    INSTANTIATE_GPU_ConvolutionMI_LinearCombinationKernel(OutPixelT,InPixelT) \
    INSTANTIATE_GPU_ConvolutionImage_SpatiallyInvariantKernel(OutPixelT,InPixelT) \
    INSTANTIATE_GPU_ConvolutionMI_SpatiallyInvariantKernel(OutPixelT,InPixelT)
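// Typical expansion (illustrative only; the pixel-type combinations actually
// instantiated are defined elsewhere in the original file):
//
//     INSTANTIATE(double, double)
//     INSTANTIATE(float, float)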