#include <cuda_runtime.h>

#include <cassert>
#include <vector>

using namespace std;
using namespace lsst::afw::gpu;
using namespace lsst::afw::gpu::detail;

namespace pexExcept = lsst::pex::exceptions;
namespace afwMath = lsst::afw::math;
namespace mathDetailGpu = lsst::afw::math::detail::gpu;
// Shared memory (in bytes) reserved as a safety margin for static per-block
// allocations, beyond the explicitly computed buffers below.
const int shMemBytesUsed = 200;
bool IsSufficientSharedMemoryAvailable_ForImgBlock(int filterW, int filterH, int pixSize)
{
    int shMemSize = GetCudaCurSMSharedMemorySize();
    int bufferSize = (filterW + blockSizeX - 1) * (filterH + blockSizeY - 1) * pixSize;

    return shMemSize - shMemBytesUsed - bufferSize > 0;
}
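// Illustrative usage sketch (not part of the original file): a caller would
// typically consult this check before committing to the GPU code path and
// fall back to the CPU implementation otherwise. The fallback function named
// here is hypothetical.
//
//     if (IsSufficientSharedMemoryAvailable_ForImgBlock(kernelW, kernelH,
//                                                       sizeof(InPixelT))) {
//         GPU_ConvolutionImage_SpatiallyInvariantKernel(inBuf, outBuf, kernelBuf);
//     } else {
//         ConvolveOnCpu(inBuf, outBuf, kernelBuf);  // hypothetical fallback
//     }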
bool IsSufficientSharedMemoryAvailable_ForImgAndMaskBlock(int filterW, int filterH, int pixSize)
{
    int shMemSize = GetCudaCurSMSharedMemorySize();

    int imgBufferSize = (filterW + blockSizeX - 1) * (filterH + blockSizeY - 1) * pixSize;
    int mskBufferSize = (filterW + blockSizeX - 1) * (filterH + blockSizeY - 1) * sizeof(MskPixel);

    int memRemaining = shMemSize - shMemBytesUsed - imgBufferSize - mskBufferSize;

    return memRemaining > 0;
}
bool IsSufficientSharedMemoryAvailable_ForSfn(int order, int kernelN)
{
    int shMemSize = GetCudaCurSMSharedMemorySize();

    const int coeffN = order + 1;
    const int coeffPadding = coeffN + 1 - (coeffN % 2);  // round the coefficient count up to an odd number
    int paramN = (order + 1) * (order + 2) / 2;

    int yCoeffsAll = coeffPadding * blockSizeX * sizeof(double);
    int xChebyAll  = coeffPadding * blockSizeX * sizeof(double);
    int smemParams = paramN * sizeof(double);

    int smemKernelSum = kernelN * sizeof(double);
    int smemSfnPtr    = kernelN * sizeof(double*);

    int memRemainingSfn  = shMemSize - shMemBytesUsed - yCoeffsAll - xChebyAll - smemParams;
    int memRemainingNorm = shMemSize - shMemBytesUsed - smemKernelSum - smemSfnPtr;

    return min(memRemainingSfn, memRemainingNorm) > 0;
}
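// Worked example of the shared-memory budget (illustrative; assumes
// blockSizeX == 16, the actual launch-block constant is defined elsewhere in
// this package). For order = 2:
//
//     coeffN     = 3,  coeffPadding = 3,  paramN = 6
//     yCoeffsAll = 3 * 16 * sizeof(double) = 384 bytes
//     xChebyAll  = 384 bytes
//     smemParams = 6 * sizeof(double)      = 48 bytes
//     memRemainingSfn = shMemSize - 200 - 816
//
// so any device offering more than about 1 KiB of shared memory per block
// passes the spatial-function side of this check.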
int CalcBlockCount(int multiprocCount)
{
    if (multiprocCount < 12) return multiprocCount * 4;
    if (multiprocCount < 24) return multiprocCount * 2;
    return multiprocCount;
}
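// The multiplier schedule oversubscribes small devices (several blocks per
// streaming multiprocessor keeps them busy) and tapers off for larger ones.
// Typical use, combining the device-query helpers referenced in this file:
//
//     int blockN = CalcBlockCount(GetCudaCurSMCount());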
void TestGpuKernel(int& ret1, int& ret2)
{
    int res[2];

    GpuMemOwner<int> resGpu;
    resGpu.Alloc(2);

    gpu::CallTestGpuKernel(resGpu.ptr);

    resGpu.CopyFromGpu(res);

    ret1 = res[0];
    ret2 = res[1];
}
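// Illustrative self-test call (the expected values are whatever constants
// the test kernel writes; they are not shown in this listing):
//
//     int ret1 = 0, ret2 = 0;
//     TestGpuKernel(ret1, ret2);
//     bool gpuIsWorking = (ret1 == expectedRet1 && ret2 == expectedRet2);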
template <typename ResultT, typename InT>
vector<ResultT> SumsOfImages(const vector< GpuBuffer2D<InT> >& images)
{
    int n = int(images.size());
    vector<ResultT> sum(n);
    for (int i = 0; i < n; i++) {
        ResultT totalSum = 0;
        int h = images[i].height;
        int w = images[i].width;

        for (int y = 0; y < h; y++) {
            ResultT rowSum = 0;
            for (int x = 0; x < w; x++) {
                rowSum += images[i].Pixel(x, y);
            }
            totalSum += rowSum;
        }
        sum[i] = totalSum;
    }
    return sum;
}
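// Why the per-row accumulator: summing each row into rowSum and only then
// adding it to totalSum keeps intermediate sums small, reducing
// floating-point rounding error on large images (a cheap, partial form of
// pairwise summation). The same idea on a flat array, for illustration:
//
//     double SumRowwise(const double* data, int w, int h)
//     {
//         double total = 0.0;
//         for (int y = 0; y < h; y++) {
//             double rowSum = 0.0;
//             for (int x = 0; x < w; x++) rowSum += data[y * w + x];
//             total += rowSum;
//         }
//         return total;
//     }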
template <typename OutPixelT, typename InPixelT>
void GPU_ConvolutionImage_LC_Img(
    const GpuBuffer2D<InPixelT>& inImage,
    const vector<double>& colPos,
    const vector<double>& rowPos,
    const std::vector< afwMath::Kernel::SpatialFunctionPtr >& sFn,
    const vector<double*>& sFnValGPUPtr,
    GpuMemOwner<double*>& sFnValGPU,
    SpatialFunctionType_t sfType,
    GpuBuffer2D<OutPixelT>& outImage,
    KerPixel* basisKernelsListGPU,
    int kernelW, int kernelH,
    const vector<double>& basisKernelSums,
    GpuMemOwner<double>& normGPU_Owner,
    bool doNormalize
)
{
    const int kernelN = sFn.size();
    // transfer input image to GPU
    GpuMemOwner<InPixelT> inImageGPU;
    inImageGPU.Transfer(inImage);
    if (inImageGPU.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for input image");
    }
    // allocate output image on GPU
    GpuMemOwner<OutPixelT> outImageGPU;
    outImageGPU.Alloc(outImage.Size());
    if (outImageGPU.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for output image");
    }
    // transfer coordinate transformation data
    GpuMemOwner<double> colPosGPU_Owner;
    colPosGPU_Owner.TransferVec(colPos);
    if (colPosGPU_Owner.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError,
                          "Not enough memory on GPU for column coordinate transformation data");
    }
    GpuMemOwner<double> rowPosGPU_Owner;
    rowPosGPU_Owner.TransferVec(rowPos);
    if (rowPosGPU_Owner.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError,
                          "Not enough memory on GPU for row coordinate transformation data");
    }
    vector< double* >             sFnParamsGPUPtr(kernelN);
    vector< GpuMemOwner<double> > sFnParamsGPU_Owner(kernelN);

    // transfer the parameters of each spatial function to GPU
    for (int i = 0; i < kernelN; i++) {
        std::vector<double> spatialParams = sFn[i]->getParameters();
        sFnParamsGPUPtr[i] = sFnParamsGPU_Owner[i].TransferVec(spatialParams);
        if (sFnParamsGPUPtr[i] == NULL) {
            throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for spatial function parameters");
        }
    }

    int shMemSize = GetCudaCurSMSharedMemorySize() - shMemBytesUsed;
    // compute the spatial function values at the center of each output pixel
    for (int i = 0; i < kernelN; i++) {
        if (sfType == sftPolynomial) {
            const afwMath::PolynomialFunction2<double>* polySfn =
                dynamic_cast<const afwMath::PolynomialFunction2<double>*>(sFn[i].get());

            gpu::Call_PolynomialImageValues(
                sFnValGPUPtr[i], outImage.width, outImage.height,
                polySfn->getOrder(),
                sFnParamsGPU_Owner[i].ptr,
                colPosGPU_Owner.ptr,
                rowPosGPU_Owner.ptr,
                shMemSize
            );

            cudaError_t cuda_err = cudaGetLastError();
            if (cuda_err != cudaSuccess) {
                throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");
            }
        } else if (sfType == sftChebyshev) {
            const afwMath::Chebyshev1Function2<double>* chebSfn =
                dynamic_cast<const afwMath::Chebyshev1Function2<double>*>(sFn[i].get());

            lsst::afw::geom::Box2D xyRange = chebSfn->getXYRange();

            gpu::Call_ChebyshevImageValues(
                sFnValGPUPtr[i], outImage.width, outImage.height,
                chebSfn->getOrder(),
                sFnParamsGPU_Owner[i].ptr,
                colPosGPU_Owner.ptr,
                rowPosGPU_Owner.ptr,
                xyRange.getMinX(), xyRange.getMinY(), xyRange.getMaxX(), xyRange.getMaxY(),
                shMemSize
            );

            cudaError_t cuda_err = cudaGetLastError();
            if (cuda_err != cudaSuccess) {
                throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");
            }
        }
    }
    cudaThreadSynchronize();
    cudaError_t cuda_err = cudaGetLastError();
    if (cuda_err != cudaSuccess) {
        throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");
    }
    int blockN = CalcBlockCount(GetCudaCurSMCount());

    if (doNormalize) {
        // transfer basis kernel sums and the divide-by-zero flag to GPU
        GpuMemOwner<double> basisKernelSumsGPU;
        basisKernelSumsGPU.TransferVec(basisKernelSums);

        bool isDivideByZero = false;
        GpuMemOwner<bool> isDivideByZeroGPU;
        isDivideByZeroGPU.Transfer(&isDivideByZero, 1);

        gpu::Call_NormalizationImageValues(
            normGPU_Owner.ptr, outImage.width, outImage.height,
            sFnValGPU.ptr, kernelN,
            basisKernelSumsGPU.ptr,
            isDivideByZeroGPU.ptr,
            blockN,
            shMemSize
        );
        cudaThreadSynchronize();
        if (cudaGetLastError() != cudaSuccess) {
            throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");
        }
        CopyFromGpu<bool>(&isDivideByZero, isDivideByZeroGPU.ptr, 1);
        if (isDivideByZero) {
            throw LSST_EXCEPT(pexExcept::OverflowError, "Cannot normalize; kernel sum is 0");
        }
    }
    // convolve the input image with the linear combination of basis kernels
    mathDetailGpu::Call_ConvolutionKernel_LC_Img(
        inImageGPU.ptr, inImage.width, inImage.height,
        basisKernelsListGPU, kernelN,
        kernelW, kernelH,
        sFnValGPU.ptr,
        normGPU_Owner.ptr,
        outImageGPU.ptr,
        blockN,
        shMemSize
    );
    cudaThreadSynchronize();
    if (cudaGetLastError() != cudaSuccess) {
        throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");
    }

    CopyFromGpu(outImage.img, outImageGPU.ptr, outImage.Size());
}
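// The launch / synchronize / check idiom above recurs throughout this file.
// A helper of the following shape (not in the original) would factor it out.
// Note that cudaThreadSynchronize() matches the CUDA runtime generation this
// file targets; later toolkits supersede it with cudaDeviceSynchronize():
//
//     inline void SynchronizeAndCheckError(const char* errMsg)
//     {
//         cudaThreadSynchronize();
//         if (cudaGetLastError() != cudaSuccess) {
//             throw LSST_EXCEPT(GpuRuntimeError, errMsg);
//         }
//     }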
template <typename OutPixelT, typename InPixelT>
void GPU_ConvolutionImage_LinearCombinationKernel(
    GpuBuffer2D<InPixelT>& inImage,
    vector<double> colPos,
    vector<double> rowPos,
    std::vector< afwMath::Kernel::SpatialFunctionPtr > sFn,
    GpuBuffer2D<OutPixelT>& outImage,
    std::vector< GpuBuffer2D<KerPixel> >& basisKernels,
    SpatialFunctionType_t sfType,
    bool doNormalize
)
{
    assert(basisKernels.size() == sFn.size());

    int outWidth = outImage.width;
    int outHeight = outImage.height;

    const int kernelN = sFn.size();
    const int kernelW = basisKernels[0].width;
    const int kernelH = basisKernels[0].height;
    const int kernelSize = kernelW * kernelH;

    for (int i = 0; i < kernelN; i++) {
        assert(kernelW == basisKernels[i].width);
        assert(kernelH == basisKernels[i].height);
    }
    // transfer the basis kernels to GPU, packed one after another
    GpuMemOwner<KerPixel> basisKernelsGPU;
    basisKernelsGPU.Alloc(kernelSize * kernelN);

    for (int i = 0; i < kernelN; i++) {
        KerPixel* kernelBeg = basisKernelsGPU.ptr + (kernelSize * i);
        CopyToGpu(kernelBeg, basisKernels[i].img, kernelSize);
    }
    // allocate spatial function value images on GPU
    vector< double* >             sFnValGPUPtr(kernelN);
    vector< GpuMemOwner<double> > sFnValGPU_Owner(kernelN);

    for (int i = 0; i < kernelN; i++) {
        sFnValGPUPtr[i] = sFnValGPU_Owner[i].Alloc(outWidth * outHeight);
        if (sFnValGPUPtr[i] == NULL) {
            throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for spatial function values");
        }
    }
    GpuMemOwner<double*> sFnValGPU;
    sFnValGPU.TransferVec(sFnValGPUPtr);
    GpuMemOwner<double> normGPU_Owner;
    vector<double> basisKernelSums(kernelN);
    if (doNormalize) {
        // allocate normalization coefficient image on GPU
        normGPU_Owner.Alloc(outWidth * outHeight);
        if (normGPU_Owner.ptr == NULL) {
            throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for normalization coefficients");
        }

        // calculate the sum of each basis kernel on the CPU
        basisKernelSums = SumsOfImages<double, KerPixel>(basisKernels);
    }

    GPU_ConvolutionImage_LC_Img<OutPixelT, InPixelT>(
        inImage, colPos, rowPos, sFn,
        sFnValGPUPtr, sFnValGPU,
        sfType,
        outImage,
        basisKernelsGPU.ptr,
        kernelW, kernelH,
        basisKernelSums,
        normGPU_Owner,
        doNormalize
    );
}
#define INSTANTIATE_GPU_ConvolutionImage_LinearCombinationKernel(OutPixelT,InPixelT) \
    template void GPU_ConvolutionImage_LinearCombinationKernel<OutPixelT,InPixelT>( \
        GpuBuffer2D<InPixelT>& inImage, \
        vector<double> colPos, \
        vector<double> rowPos, \
        std::vector< afwMath::Kernel::SpatialFunctionPtr > sFn, \
        GpuBuffer2D<OutPixelT>& outImage, \
        std::vector< GpuBuffer2D<KerPixel> >& basisKernels, \
        SpatialFunctionType_t sfType, \
        bool doNormalize \
    );
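// Illustrative call sketch (not from the original file; buffer construction
// and spatial-function setup are assumed to be done by the higher-level
// convolution wrapper, and the buffer names here are hypothetical):
//
//     std::vector<afwMath::Kernel::SpatialFunctionPtr> sFn = /* one per basis kernel */;
//     GPU_ConvolutionImage_LinearCombinationKernel<double, float>(
//         inBuf, colPos, rowPos, sFn,
//         outBuf, basisKernels, sftChebyshev, true /*doNormalize*/);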
template <typename OutPixelT, typename InPixelT>
void GPU_ConvolutionMI_LinearCombinationKernel(
    GpuBuffer2D<InPixelT>& inImageImg,
    GpuBuffer2D<VarPixel>& inImageVar,
    GpuBuffer2D<MskPixel>& inImageMsk,
    vector<double> colPos,
    vector<double> rowPos,
    std::vector< afwMath::Kernel::SpatialFunctionPtr > sFn,
    GpuBuffer2D<OutPixelT>& outImageImg,
    GpuBuffer2D<VarPixel>& outImageVar,
    GpuBuffer2D<MskPixel>& outImageMsk,
    std::vector< GpuBuffer2D<KerPixel> >& basisKernels,
    SpatialFunctionType_t sfType,
    bool doNormalize
)
{
    assert(basisKernels.size() == sFn.size());
    assert(outImageImg.width == outImageVar.width);
    assert(outImageImg.width == outImageMsk.width);
    assert(outImageImg.height == outImageVar.height);
    assert(outImageImg.height == outImageMsk.height);

    int outWidth = outImageImg.width;
    int outHeight = outImageImg.height;

    const int kernelN = sFn.size();
    const int kernelW = basisKernels[0].width;
    const int kernelH = basisKernels[0].height;
    const int kernelSize = kernelW * kernelH;

    for (int i = 0; i < kernelN; i++) {
        assert(kernelW == basisKernels[i].width);
        assert(kernelH == basisKernels[i].height);
    }
    // transfer the basis kernels to GPU, packed one after another
    GpuMemOwner<KerPixel> basisKernelsGPU;
    basisKernelsGPU.Alloc(kernelSize * kernelN);

    for (int i = 0; i < kernelN; i++) {
        KerPixel* kernelBeg = basisKernelsGPU.ptr + (kernelSize * i);
        CopyToGpu(kernelBeg, basisKernels[i].img, kernelSize);
    }
    // allocate spatial function value images on GPU
    vector< double* >             sFnValGPUPtr(kernelN);
    vector< GpuMemOwner<double> > sFnValGPU_Owner(kernelN);

    for (int i = 0; i < kernelN; i++) {
        sFnValGPUPtr[i] = sFnValGPU_Owner[i].Alloc(outWidth * outHeight);
        if (sFnValGPUPtr[i] == NULL) {
            throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for spatial function values");
        }
    }
    GpuMemOwner<double*> sFnValGPU;
    sFnValGPU.TransferVec(sFnValGPUPtr);
    GpuMemOwner<double> normGPU_Owner;
    vector<double> basisKernelSums(kernelN);
    if (doNormalize) {
        // allocate normalization coefficient image on GPU
        normGPU_Owner.Alloc(outWidth * outHeight);
        if (normGPU_Owner.ptr == NULL) {
            throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for normalization coefficients");
        }

        // calculate the sum of each basis kernel on the CPU
        basisKernelSums = SumsOfImages<double, KerPixel>(basisKernels);
    }

    // convolve the image plane
    GPU_ConvolutionImage_LC_Img<OutPixelT, InPixelT>(
        inImageImg, colPos, rowPos, sFn,
        sFnValGPUPtr, sFnValGPU,
        sfType,
        outImageImg,
        basisKernelsGPU.ptr,
        kernelW, kernelH,
        basisKernelSums,
        normGPU_Owner,
        doNormalize
    );
    // transfer input variance and mask planes to GPU
    GpuMemOwner<VarPixel> inImageGPUVar;
    inImageGPUVar.Transfer(inImageVar);
    if (inImageGPUVar.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for input variance");
    }
    GpuMemOwner<MskPixel> inImageGPUMsk;
    inImageGPUMsk.Transfer(inImageMsk);
    if (inImageGPUMsk.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for input mask");
    }

    // allocate output variance and mask planes on GPU
    GpuMemOwner<VarPixel> outImageGPUVar;
    outImageGPUVar.Alloc(outImageVar.Size());
    if (outImageGPUVar.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for output variance");
    }
    GpuMemOwner<MskPixel> outImageGPUMsk;
    outImageGPUMsk.Alloc(outImageMsk.Size());
    if (outImageGPUMsk.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for output mask");
    }
    int shMemSize = GetCudaCurSMSharedMemorySize() - shMemBytesUsed;
    int blockN = CalcBlockCount(GetCudaCurSMCount());

    // convolve the variance and mask planes
    mathDetailGpu::Call_ConvolutionKernel_LC_Var(
        inImageGPUVar.ptr, inImageVar.width, inImageVar.height,
        basisKernelsGPU.ptr, kernelN,
        kernelW, kernelH,
        inImageGPUMsk.ptr,
        sFnValGPU.ptr,
        normGPU_Owner.ptr,
        outImageGPUVar.ptr,
        outImageGPUMsk.ptr,
        blockN,
        shMemSize
    );
    cudaThreadSynchronize();
    if (cudaGetLastError() != cudaSuccess)
        throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");

    CopyFromGpu(outImageVar.img, outImageGPUVar.ptr, outImageVar.Size());
    CopyFromGpu(outImageMsk.img, outImageGPUMsk.ptr, outImageMsk.Size());
}
#define INSTANTIATE_GPU_ConvolutionMI_LinearCombinationKernel(OutPixelT,InPixelT) \
    template void GPU_ConvolutionMI_LinearCombinationKernel<OutPixelT,InPixelT>( \
        GpuBuffer2D<InPixelT>& inImageImg, \
        GpuBuffer2D<VarPixel>& inImageVar, \
        GpuBuffer2D<MskPixel>& inImageMsk, \
        vector<double> colPos, \
        vector<double> rowPos, \
        std::vector< afwMath::Kernel::SpatialFunctionPtr > sFn, \
        GpuBuffer2D<OutPixelT>& outImageImg, \
        GpuBuffer2D<VarPixel>& outImageVar, \
        GpuBuffer2D<MskPixel>& outImageMsk, \
        std::vector< GpuBuffer2D<KerPixel> >& basisKernels, \
        SpatialFunctionType_t sfType, \
        bool doNormalize \
    );
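// The masked-image variant is called the same way as the image-only variant,
// with the three input planes and three output planes passed separately
// (sketch only; buffer names are hypothetical):
//
//     GPU_ConvolutionMI_LinearCombinationKernel<double, float>(
//         inImg, inVar, inMsk, colPos, rowPos, sFn,
//         outImg, outVar, outMsk, basisKernels, sftPolynomial, true /*doNormalize*/);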
template <typename OutPixelT, typename InPixelT>
void GPU_ConvolutionImage_SpatiallyInvariantKernel(
    GpuBuffer2D<InPixelT>& inImage,
    GpuBuffer2D<OutPixelT>& outImage,
    GpuBuffer2D<KerPixel>& kernel
)
{
    int kernelW = kernel.width;
    int kernelH = kernel.height;

    GpuMemOwner<InPixelT> inImageGPU;
    inImageGPU.Transfer(inImage);
    if (inImageGPU.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for input image");
    }

    int shMemSize = GetCudaCurSMSharedMemorySize() - shMemBytesUsed;

    // transfer the kernel to GPU
    GpuMemOwner<KerPixel> basisKernelGPU;
    basisKernelGPU.Transfer(kernel);
    if (basisKernelGPU.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for kernel");
    }
    // allocate array of output images on GPU (a single image here)
    vector< OutPixelT* > outImageGPUPtr(1);
    vector< GpuMemOwner<OutPixelT> > outImageGPU_Owner(1);

    outImageGPUPtr[0] = outImageGPU_Owner[0].Alloc(outImage.Size());
    if (outImageGPUPtr[0] == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for output image");
    }
    GpuMemOwner<OutPixelT*> outImageGPU;
    outImageGPU.TransferVec(outImageGPUPtr);

    int blockN = CalcBlockCount(GetCudaCurSMCount());

    mathDetailGpu::Call_SpatiallyInvariantImageConvolutionKernel<OutPixelT, InPixelT>(
        inImageGPU.ptr, inImage.width, inImage.height,
        basisKernelGPU.ptr, 1,
        kernelW, kernelH,
        outImageGPU.ptr,
        blockN,
        shMemSize
    );
    cudaThreadSynchronize();
    if (cudaGetLastError() != cudaSuccess) {
        throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");
    }
    CopyFromGpu(outImage.img, outImageGPUPtr[0], outImage.Size());
}
#define INSTANTIATE_GPU_ConvolutionImage_SpatiallyInvariantKernel(OutPixelT,InPixelT) \
    template void GPU_ConvolutionImage_SpatiallyInvariantKernel<OutPixelT,InPixelT>( \
        GpuBuffer2D<InPixelT>& inImage, \
        GpuBuffer2D<OutPixelT>& outImage, \
        GpuBuffer2D<KerPixel>& kernel \
    );
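// Illustrative call sketch for the simplest entry point (buffer construction
// is assumed; the GpuBuffer2D constructors shown are assumptions for
// illustration only):
//
//     GpuBuffer2D<KerPixel> kernelBuf(kernelImage);
//     GpuBuffer2D<float>    inBuf(inImage);
//     GpuBuffer2D<double>   outBuf(inBuf.width, inBuf.height);
//     GPU_ConvolutionImage_SpatiallyInvariantKernel<double, float>(inBuf, outBuf, kernelBuf);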
template <typename OutPixelT, typename InPixelT>
void GPU_ConvolutionMI_SpatiallyInvariantKernel(
    GpuBuffer2D<InPixelT>& inImageImg,
    GpuBuffer2D<VarPixel>& inImageVar,
    GpuBuffer2D<MskPixel>& inImageMsk,
    GpuBuffer2D<OutPixelT>& outImageImg,
    GpuBuffer2D<VarPixel>& outImageVar,
    GpuBuffer2D<MskPixel>& outImageMsk,
    GpuBuffer2D<KerPixel>& kernel
)
{
    int kernelW = kernel.width;
    int kernelH = kernel.height;

    GpuMemOwner<InPixelT> inImageGPUImg;
    inImageGPUImg.Transfer(inImageImg);
    if (inImageGPUImg.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for input image");
    }
    GpuMemOwner<VarPixel> inImageGPUVar;
    inImageGPUVar.Transfer(inImageVar);
    if (inImageGPUVar.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for input variance");
    }
    GpuMemOwner<MskPixel> inImageGPUMsk;
    inImageGPUMsk.Transfer(inImageMsk);
    if (inImageGPUMsk.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for input mask");
    }

    int shMemSize = GetCudaCurSMSharedMemorySize() - shMemBytesUsed;

    // transfer the kernel to GPU
    GpuMemOwner<KerPixel> basisKernelGPU;
    basisKernelGPU.Transfer(kernel);
    if (basisKernelGPU.ptr == NULL)
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for kernel");
    // allocate array of output image planes on GPU (one plane each)
    vector< OutPixelT* > outImageGPUPtrImg(1);
    vector< VarPixel* >  outImageGPUPtrVar(1);
    vector< MskPixel* >  outImageGPUPtrMsk(1);

    vector< GpuMemOwner<OutPixelT> > outImageGPU_OwnerImg(1);
    vector< GpuMemOwner<VarPixel > > outImageGPU_OwnerVar(1);
    vector< GpuMemOwner<MskPixel > > outImageGPU_OwnerMsk(1);

    outImageGPUPtrImg[0] = outImageGPU_OwnerImg[0].Alloc(outImageImg.Size());
    if (outImageGPUPtrImg[0] == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for output image");
    }
    outImageGPUPtrVar[0] = outImageGPU_OwnerVar[0].Alloc(outImageVar.Size());
    if (outImageGPUPtrVar[0] == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for output variance");
    }
    outImageGPUPtrMsk[0] = outImageGPU_OwnerMsk[0].Alloc(outImageMsk.Size());
    if (outImageGPUPtrMsk[0] == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for output mask");
    }

    GpuMemOwner<OutPixelT*> outImageGPUImg;
    outImageGPUImg.TransferVec(outImageGPUPtrImg);
    GpuMemOwner<VarPixel*> outImageGPUVar;
    outImageGPUVar.TransferVec(outImageGPUPtrVar);
    GpuMemOwner<MskPixel*> outImageGPUMsk;
    outImageGPUMsk.TransferVec(outImageGPUPtrMsk);
    int blockN = CalcBlockCount(GetCudaCurSMCount());

    // convolve the image plane
    mathDetailGpu::Call_SpatiallyInvariantImageConvolutionKernel<OutPixelT, InPixelT>(
        inImageGPUImg.ptr, inImageImg.width, inImageImg.height,
        basisKernelGPU.ptr, 1,
        kernelW, kernelH,
        outImageGPUImg.ptr,
        blockN,
        shMemSize
    );

    // while the GPU convolves the image plane, square the kernel on the CPU;
    // the variance plane is convolved with the squared kernel
    for (int y = 0; y < kernelH; y++) {
        for (int x = 0; x < kernelW; x++) {
            kernel.Pixel(x, y) *= kernel.Pixel(x, y);
        }
    }

    CopyFromGpu(outImageImg.img, outImageGPUPtrImg[0], outImageImg.Size());

    // transfer the squared kernel to GPU
    basisKernelGPU.CopyToGpu(kernel);
    // convolve the variance plane
    mathDetailGpu::Call_SpatiallyInvariantImageConvolutionKernel<VarPixel, VarPixel>(
        inImageGPUVar.ptr, inImageVar.width, inImageVar.height,
        basisKernelGPU.ptr, 1,
        kernelW, kernelH,
        outImageGPUVar.ptr,
        blockN,
        shMemSize
    );
    cudaThreadSynchronize();
    if (cudaGetLastError() != cudaSuccess) {
        throw LSST_EXCEPT(GpuRuntimeError, "GPU variance calculation failed to run");
    }

    // convolve the mask plane
    mathDetailGpu::Call_SpatiallyInvariantMaskConvolutionKernel(
        inImageGPUMsk.ptr, inImageMsk.width, inImageMsk.height,
        basisKernelGPU.ptr, 1,
        kernelW, kernelH,
        outImageGPUMsk.ptr,
        blockN,
        shMemSize
    );
    cudaThreadSynchronize();
    if (cudaGetLastError() != cudaSuccess) {
        throw LSST_EXCEPT(GpuRuntimeError, "GPU mask calculation failed to run");
    }

    CopyFromGpu(outImageVar.img, outImageGPUPtrVar[0], outImageVar.Size());
    CopyFromGpu(outImageMsk.img, outImageGPUPtrMsk[0], outImageMsk.Size());
}
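// Why the kernel is squared for the variance plane: for an output pixel
// formed as a weighted sum  out = sum_i k_i * in_i  of independent input
// pixels, error propagation gives  var(out) = sum_i k_i^2 * var(in_i).
// A scalar sanity check with a 3-element kernel (illustrative):
//
//     k = {0.25, 0.50, 0.25},  all inputs with variance v
//     var(out) = (0.0625 + 0.25 + 0.0625) * v = 0.375 * v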
#define INSTANTIATE_GPU_ConvolutionMI_SpatiallyInvariantKernel(OutPixelT,InPixelT) \
    template void GPU_ConvolutionMI_SpatiallyInvariantKernel<OutPixelT,InPixelT>( \
        GpuBuffer2D<InPixelT>& inImageImg, \
        GpuBuffer2D<VarPixel>& inImageVar, \
        GpuBuffer2D<MskPixel>& inImageMsk, \
        GpuBuffer2D<OutPixelT>& outImageImg, \
        GpuBuffer2D<VarPixel>& outImageVar, \
        GpuBuffer2D<MskPixel>& outImageMsk, \
        GpuBuffer2D<KerPixel>& kernel \
    );
#define INSTANTIATE(OutPixelT,InPixelT) \
    INSTANTIATE_GPU_ConvolutionImage_LinearCombinationKernel(OutPixelT,InPixelT) \
    INSTANTIATE_GPU_ConvolutionMI_LinearCombinationKernel(OutPixelT,InPixelT) \
    INSTANTIATE_GPU_ConvolutionImage_SpatiallyInvariantKernel(OutPixelT,InPixelT) \
    INSTANTIATE_GPU_ConvolutionMI_SpatiallyInvariantKernel(OutPixelT,InPixelT)
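// The translation unit would close with explicit instantiations for the
// supported pixel-type combinations; the exact list below is an assumption,
// following the combinations afw commonly instantiates for convolution:
//
//     INSTANTIATE(double, double)
//     INSTANTIATE(double, float)
//     INSTANTIATE(float, float)
//     INSTANTIATE(int, int)
//     INSTANTIATE(boost::uint16_t, boost::uint16_t)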