cudaConvWrapper.cc
// -*- LSST-C++ -*-

/*
 * LSST Data Management System
 * Copyright 2008, 2009, 2010 LSST Corporation.
 *
 * This product includes software developed by the
 * LSST Project (http://www.lsst.org/).
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the LSST License Statement and
 * the GNU General Public License along with this program. If not,
 * see <http://www.lsstcorp.org/LegalNotices/>.
 */

#ifndef GPU_BUILD

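// CPU-only (non-GPU) build: define no-op stubs so the rest of afw can link
// without CUDA. TestGpuKernel returns zeros and the shared-memory
// availability checks all report false.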
#include <cstdint>

namespace lsst {
namespace afw {
namespace math {
namespace detail {


void TestGpuKernel(int& ret1, int& ret2) {
    ret1 = 0;
    ret2 = 0;
}

bool IsSufficientSharedMemoryAvailable_ForImgBlock(int filterW, int filterH, int pixSize)
{
    return false;
}
bool IsSufficientSharedMemoryAvailable_ForImgAndMaskBlock(int filterW, int filterH, int pixSize)
{
    return false;
}
bool IsSufficientSharedMemoryAvailable_ForSfn(int order, int kernelN)
{
    return false;
}

}
}
}
}

#else

#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>

#include "lsst/afw/math/Kernel.h"

using namespace std;
using namespace lsst::afw::gpu;
using namespace lsst::afw::gpu::detail;
namespace afwImage = lsst::afw::image;
namespace afwMath = lsst::afw::math;
namespace mathDetailGpu = lsst::afw::math::detail::gpu;



namespace lsst {
namespace afw {
namespace math {
namespace detail {

namespace {
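// Shared-memory bytes assumed to be consumed by kernel arguments and other
// per-block overhead; subtracted from the device limit as a safety margin in
// the availability checks below (the value 200 is a conservative guess, not
// a measured figure).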
const int shMemBytesUsed = 200;
}


// Returns true if there is sufficient shared memory for loading an image block,
// where the image block includes the filter frame.
bool IsSufficientSharedMemoryAvailable_ForImgBlock(int filterW, int filterH, int pixSize)
{
    int shMemSize = GetCudaCurSMSharedMemorySize();
    int bufferSize = (filterW + blockSizeX - 1) * (filterH + blockSizeY - 1) * pixSize;

    return shMemSize - shMemBytesUsed - bufferSize > 0;
}
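// Worked example (illustrative only; blockSizeX and blockSizeY are
// compile-time constants from the GPU convolution code, assumed 16x16 here):
// a 7x7 filter on double-precision pixels needs
// (7+15)*(7+15)*8 = 3872 bytes of shared memory per block,
// well under a typical 48 kB per-SM limit.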

// Returns true if there is sufficient shared memory for loading an image block
// and an accompanying block of mask data (mask image block),
// where both blocks include the filter frame.
bool IsSufficientSharedMemoryAvailable_ForImgAndMaskBlock(int filterW, int filterH, int pixSize)
{
    int shMemSize = GetCudaCurSMSharedMemorySize();
    int imgBufferSize = (filterW + blockSizeX - 1) * (filterH + blockSizeY - 1) * pixSize;
    int mskBufferSize = (filterW + blockSizeX - 1) * (filterH + blockSizeY - 1) * sizeof(MskPixel);

    int memRemaining = shMemSize - shMemBytesUsed - imgBufferSize - mskBufferSize;

    return memRemaining > 0;
}
// Returns true if there is sufficient shared memory for loading
// parameters and temporary values for Chebyshev function and normalization
bool IsSufficientSharedMemoryAvailable_ForSfn(int order, int kernelN)
{
    int shMemSize = GetCudaCurSMSharedMemorySize();

    const int coeffN = order + 1;
    const int coeffPadding = coeffN + 1 - (coeffN % 2);
    int paramN = (order + 1) * (order + 2) / 2;

    int yCoeffsAll = coeffPadding * blockSizeX * sizeof(double);
    int xChebyAll = coeffPadding * blockSizeX * sizeof(double);
    int smemParams = paramN * sizeof(double);

    int smemKernelSum = kernelN * sizeof(double);
    int smemSfnPtr = kernelN * sizeof(double*);

    int memRemainingSfn = shMemSize - shMemBytesUsed - yCoeffsAll - xChebyAll - smemParams;
    int memRemainingNorm = shMemSize - shMemBytesUsed - smemKernelSum - smemSfnPtr;

    return min(memRemainingSfn, memRemainingNorm) > 0;
}
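// Note on the padding arithmetic above: coeffPadding rounds an even
// coefficient count up to the next odd value (e.g. order = 3 gives
// coeffN = 4, padded to 5), presumably to stagger per-thread coefficient
// rows across shared-memory banks and avoid bank conflicts.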

// Decides on the best GPU block count.
// Uses a simple heuristic (not too few blocks and not too many),
// but guarantees that the number of blocks will be a multiple of the number of multiprocessors.
int CalcBlockCount(int multiprocCount)
{
    if (multiprocCount < 12) return multiprocCount * 4;
    if (multiprocCount < 24) return multiprocCount * 2;
    return multiprocCount;
}
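// For example: 8 multiprocessors -> 32 blocks, 16 -> 32, 30 -> 30;
// the result is always a multiple of multiprocCount, so the blocks
// divide evenly across the device.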


// Calls the test GPU kernel.
// On success it returns 5 and 8 in ret1 and ret2.
void TestGpuKernel(int& ret1, int& ret2)
{
    int res[2];

    GpuMemOwner<int> resGpu;
    resGpu.Alloc(2);

    gpu::CallTestGpuKernel(resGpu.ptr);

    resGpu.CopyFromGpu(res);

    ret1 = res[0];
    ret2 = res[1];
}
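// Minimal usage sketch (hypothetical caller):
//     int a = 0, b = 0;
//     TestGpuKernel(a, b);
//     bool gpuWorks = (a == 5 && b == 8);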

namespace {

// Calculates the sum of each image in the 'images' vector.
// Each row is summed separately and then accumulated, which limits
// floating-point rounding error for large images.
template <typename ResultT, typename InT>
vector<ResultT> SumsOfImages(const vector< GpuBuffer2D<InT> >& images)
{
    int n = int(images.size());
    vector<ResultT> sum(n);
    for (int i = 0; i < n; i++) {
        ResultT totalSum = 0;
        int h = images[i].height;
        int w = images[i].width;

        for (int y = 0; y < h; y++) {
            ResultT rowSum = 0;
            for (int x = 0; x < w; x++) {
                rowSum += images[i].Pixel(x, y);
            }
            totalSum += rowSum;
        }
        sum[i] = totalSum;
    }
    return sum;
}
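// When doNormalize is set, the per-kernel sums computed by SumsOfImages are
// combined on the GPU with the spatial function values (see
// Call_NormalizationImageValues below) to build the per-pixel normalization
// image normGPU.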

template <typename OutPixelT, typename InPixelT>
void GPU_ConvolutionImage_LC_Img(
    const GpuBuffer2D<InPixelT>& inImage,
    const vector<double>& colPos,
    const vector<double>& rowPos,
    const std::vector< afwMath::Kernel::SpatialFunctionPtr >& sFn,
    const vector<double*>& sFnValGPUPtr, //output
    double** sFnValGPU, //output
    SpatialFunctionType_t sfType,
    GpuBuffer2D<OutPixelT>& outImage, //output
    KerPixel* basisKernelsListGPU,
    int kernelW, int kernelH,
    const vector<double>& basisKernelSums, //input
    double* normGPU, //output
    bool doNormalize
)
{
    const int kernelN = sFn.size();

    //transfer input image
    GpuMemOwner<InPixelT > inImageGPU;
    inImageGPU.Transfer(inImage);
    if (inImageGPU.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for input image");
    }
    // allocate output image plane on GPU
    GpuMemOwner<OutPixelT> outImageGPU;
    outImageGPU.Alloc(outImage.Size());
    if (outImageGPU.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for output image");
    }
    //transfer coordinate transform data
    GpuMemOwner<double> colPosGPU_Owner;
    colPosGPU_Owner.TransferVec(colPos);
    if (colPosGPU_Owner.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError,
                          "Not enough memory on GPU for column coordinate transformation data");
    }
    GpuMemOwner<double> rowPosGPU_Owner;
    rowPosGPU_Owner.TransferVec(rowPos);
    if (rowPosGPU_Owner.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError,
                          "Not enough memory on GPU for row coordinate transformation data");
    }
    vector< double* > sFnParamsGPUPtr(kernelN);
    vector< GpuMemOwner<double> > sFnParamsGPU_Owner(kernelN);

    //transfer sfn parameters to GPU
    for (int i = 0; i < kernelN; i++) {
        std::vector<double> spatialParams = sFn[i]->getParameters();
        sFnParamsGPUPtr[i] = sFnParamsGPU_Owner[i].TransferVec(spatialParams);
        if (sFnParamsGPUPtr[i] == NULL) {
            throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for spatial function parameters");
        }
    }

    int shMemSize = GetCudaCurSMSharedMemorySize() - shMemBytesUsed;

    for (int i = 0; i < kernelN; i++)
    {
        cudaGetLastError(); //clear error status

        if (sfType == sftPolynomial) {
            const afwMath::PolynomialFunction2<double>* polySfn =
                dynamic_cast<const afwMath::PolynomialFunction2<double>*>( sFn[i].get() );
            if (polySfn == NULL) { // guard against a mismatched spatial function type
                throw LSST_EXCEPT(GpuRuntimeError, "Expected a PolynomialFunction2 spatial function");
            }

            gpu::Call_PolynomialImageValues(
                sFnValGPUPtr[i], outImage.width, outImage.height,
                polySfn->getOrder(),
                sFnParamsGPU_Owner[i].ptr,
                rowPosGPU_Owner.ptr,
                colPosGPU_Owner.ptr,
                shMemSize
            );

            //cudaThreadSynchronize();
            cudaError_t cuda_err = cudaGetLastError();
            if (cuda_err != cudaSuccess) {
                throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");
            }
        }
        if (sfType == sftChebyshev) {
            const afwMath::Chebyshev1Function2<double>* chebSfn =
                dynamic_cast<const afwMath::Chebyshev1Function2<double>*>( sFn[i].get() );
            if (chebSfn == NULL) { // guard against a mismatched spatial function type
                throw LSST_EXCEPT(GpuRuntimeError, "Expected a Chebyshev1Function2 spatial function");
            }

            lsst::afw::geom::Box2D const xyRange = chebSfn->getXYRange();

            gpu::Call_ChebyshevImageValues(
                sFnValGPUPtr[i], outImage.width, outImage.height,
                chebSfn->getOrder(),
                sFnParamsGPU_Owner[i].ptr,
                rowPosGPU_Owner.ptr,
                colPosGPU_Owner.ptr,
                xyRange.getMinX(), xyRange.getMinY(), xyRange.getMaxX(), xyRange.getMaxY(),
                shMemSize
            );

            //cudaThreadSynchronize();
            cudaError_t cuda_err = cudaGetLastError();
            if (cuda_err != cudaSuccess) {
                throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");
            }
        }
    }
    cudaThreadSynchronize();
    cudaError_t cuda_err = cudaGetLastError();
    if (cuda_err != cudaSuccess) {
        throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");
    }

    int blockN = CalcBlockCount( GetCudaCurSMCount());

    //transfer basis kernel sums
    if (doNormalize) {
        GpuMemOwner<double> basisKernelSumsGPU;
        basisKernelSumsGPU.TransferVec(basisKernelSums);

        bool isDivideByZero = false;
        GpuMemOwner<bool> isDivideByZeroGPU;
        isDivideByZeroGPU.Transfer(&isDivideByZero, 1);

        gpu::Call_NormalizationImageValues(
            normGPU, outImage.width, outImage.height,
            sFnValGPU, kernelN,
            basisKernelSumsGPU.ptr,
            isDivideByZeroGPU.ptr,
            blockN,
            shMemSize
        );
        cudaThreadSynchronize();
        if (cudaGetLastError() != cudaSuccess) {
            throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");
        }
        CopyFromGpu<bool>(&isDivideByZero, isDivideByZeroGPU.ptr, 1);
        if (isDivideByZero) {
            throw LSST_EXCEPT(pexExcept::OverflowError, "Cannot normalize; kernel sum is 0");
        }
    }

    cudaGetLastError(); //clear error status
    mathDetailGpu::Call_ConvolutionKernel_LC_Img(
        inImageGPU.ptr, inImage.width, inImage.height,
        basisKernelsListGPU, kernelN,
        kernelW, kernelH,
        &sFnValGPU[0],
        normGPU,
        outImageGPU.ptr,
        blockN,
        shMemSize
    );
    cudaThreadSynchronize();
    if (cudaGetLastError() != cudaSuccess) {
        throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");
    }

    CopyFromGpu(outImage.img, outImageGPU.ptr, outImage.Size());

}

} //local namespace ends

template <typename OutPixelT, typename InPixelT>
void GPU_ConvolutionImage_LinearCombinationKernel(
    GpuBuffer2D<InPixelT>& inImage,
    vector<double> colPos,
    vector<double> rowPos,
    std::vector< afwMath::Kernel::SpatialFunctionPtr > sFn,
    GpuBuffer2D<OutPixelT>& outImage,
    std::vector< GpuBuffer2D<KerPixel> >& basisKernels,
    SpatialFunctionType_t sfType,
    bool doNormalize
)
{
    assert(basisKernels.size() == sFn.size());

    int outWidth = outImage.width;
    int outHeight = outImage.height;

    const int kernelN = sFn.size();
    const int kernelW = basisKernels[0].width;
    const int kernelH = basisKernels[0].height;
    const int kernelSize = kernelW * kernelH;

    for (int i = 0; i < kernelN; i++) {
        assert(kernelW == basisKernels[i].width);
        assert(kernelH == basisKernels[i].height);
    }

    // transfer array of basis kernels to the GPU
    GpuMemOwner<KerPixel > basisKernelsGPU;
    basisKernelsGPU.Alloc(kernelSize * kernelN);

    for (int i = 0; i < kernelN; i++) {
        KerPixel* kernelBeg = basisKernelsGPU.ptr + (kernelSize * i);
        CopyToGpu(kernelBeg,
                  basisKernels[i].img,
                  kernelSize
                 );
    }

    // allocate array of spatial function value images on GPU
    vector< double* > sFnValGPUPtr(kernelN);
    vector< GpuMemOwner<double > > sFnValGPU_Owner(kernelN);

    for (int i = 0; i < kernelN; i++) {
        sFnValGPUPtr[i] = sFnValGPU_Owner[i].Alloc(outWidth * outHeight);
        if (sFnValGPUPtr[i] == NULL) {
            throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for spatial function values");
        }
    }
    GpuMemOwner<double*> sFnValGPU;
    sFnValGPU.TransferVec(sFnValGPUPtr);

    GpuMemOwner<double > normGPU_Owner;
    vector<double> basisKernelSums(kernelN);
    if (doNormalize) {
        //allocate normalization coefficients
        normGPU_Owner.Alloc(outWidth * outHeight);
        if (normGPU_Owner.ptr == NULL) {
            throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for normalization coefficients");
        }

        //calculate basis kernel sums
        basisKernelSums = SumsOfImages<double, KerPixel>(basisKernels);
    }

    GPU_ConvolutionImage_LC_Img(
        inImage,
        colPos, rowPos,
        sFn,
        sFnValGPUPtr, //output
        sFnValGPU.ptr, //output
        sfType,
        outImage, //output
        basisKernelsGPU.ptr,
        kernelW, kernelH,
        basisKernelSums, //input
        normGPU_Owner.ptr, //output
        doNormalize
    );

}

#define INSTANTIATE_GPU_ConvolutionImage_LinearCombinationKernel(OutPixelT,InPixelT) \
    template void GPU_ConvolutionImage_LinearCombinationKernel<OutPixelT,InPixelT>( \
        GpuBuffer2D<InPixelT>& inImage, \
        vector<double> colPos, \
        vector<double> rowPos, \
        std::vector< afwMath::Kernel::SpatialFunctionPtr > sFn, \
        GpuBuffer2D<OutPixelT>& outImage, \
        std::vector< GpuBuffer2D<KerPixel> >& basisKernels, \
        SpatialFunctionType_t sfType, \
        bool doNormalize \
    );

template <typename OutPixelT, typename InPixelT>
void GPU_ConvolutionMI_LinearCombinationKernel(
    GpuBuffer2D<InPixelT>& inImageImg,
    GpuBuffer2D<VarPixel>& inImageVar,
    GpuBuffer2D<MskPixel>& inImageMsk,
    vector<double> colPos,
    vector<double> rowPos,
    std::vector< afwMath::Kernel::SpatialFunctionPtr > sFn,
    GpuBuffer2D<OutPixelT>& outImageImg,
    GpuBuffer2D<VarPixel>& outImageVar,
    GpuBuffer2D<MskPixel>& outImageMsk,
    std::vector< GpuBuffer2D<KerPixel> >& basisKernels,
    SpatialFunctionType_t sfType,
    bool doNormalize
)
{
    assert(basisKernels.size() == sFn.size());
    assert(outImageImg.width == outImageVar.width);
    assert(outImageImg.width == outImageMsk.width);
    assert(outImageImg.height == outImageVar.height);
    assert(outImageImg.height == outImageMsk.height);

    int outWidth = outImageImg.width;
    int outHeight = outImageImg.height;

    const int kernelN = sFn.size();
    const int kernelW = basisKernels[0].width;
    const int kernelH = basisKernels[0].height;
    const int kernelSize = kernelW * kernelH;

    for (int i = 0; i < kernelN; i++) {
        assert(kernelW == basisKernels[i].width);
        assert(kernelH == basisKernels[i].height);
    }

    // transfer basis kernels to GPU
    GpuMemOwner<KerPixel > basisKernelsGPU;
    basisKernelsGPU.Alloc(kernelSize * kernelN);

    for (int i = 0; i < kernelN; i++) {
        KerPixel* kernelBeg = basisKernelsGPU.ptr + (kernelSize * i);
        CopyToGpu(kernelBeg,
                  basisKernels[i].img,
                  kernelSize
                 );
    }

    //alloc sFn images on GPU
    vector< double* > sFnValGPUPtr(kernelN);
    vector< GpuMemOwner<double > > sFnValGPU_Owner(kernelN);

    for (int i = 0; i < kernelN; i++) {
        sFnValGPUPtr[i] = sFnValGPU_Owner[i].Alloc(outWidth * outHeight);
        if (sFnValGPUPtr[i] == NULL) {
            throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for spatial function values");
        }
    }
    GpuMemOwner<double*> sFnValGPU;
    sFnValGPU.TransferVec(sFnValGPUPtr);

    //allocate normalization coefficients image on GPU
    GpuMemOwner<double > normGPU_Owner;
    std::vector<double> basisKernelSums(kernelN);
    if (doNormalize) {
        //allocate normalization coefficients
        normGPU_Owner.Alloc(outWidth * outHeight);
        if (normGPU_Owner.ptr == NULL) {
            throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for normalization coefficients");
        }
        //calculate basis kernel sums
        basisKernelSums = SumsOfImages<double, KerPixel>(basisKernels);
    }

    GPU_ConvolutionImage_LC_Img(
        inImageImg,
        colPos, rowPos,
        sFn,
        sFnValGPUPtr, //output
        sFnValGPU.ptr, //output
        sfType,
        outImageImg, //output
        basisKernelsGPU.ptr,
        kernelW, kernelH,
        basisKernelSums, //input
        normGPU_Owner.ptr, //output
        doNormalize
    );

    //transfer input image planes to GPU
    GpuMemOwner<VarPixel> inImageGPUVar;
    inImageGPUVar.Transfer(inImageVar);
    if (inImageGPUVar.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for input variance");
    }
    GpuMemOwner<MskPixel> inImageGPUMsk;
    inImageGPUMsk.Transfer(inImageMsk);
    if (inImageGPUMsk.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for input mask");
    }

    // allocate output image planes on GPU
    GpuMemOwner<VarPixel > outImageGPUVar;
    outImageGPUVar.Alloc(outImageVar.Size());
    if (outImageGPUVar.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for output variance");
    }
    GpuMemOwner<MskPixel > outImageGPUMsk;
    outImageGPUMsk.Alloc(outImageMsk.Size());
    if (outImageGPUMsk.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for output mask");
    }
    int shMemSize = GetCudaCurSMSharedMemorySize() - shMemBytesUsed;
    int blockN = CalcBlockCount( GetCudaCurSMCount());

    cudaGetLastError(); //clear error status
    mathDetailGpu::Call_ConvolutionKernel_LC_Var(
        inImageGPUVar.ptr, inImageVar.width, inImageVar.height,
        inImageGPUMsk.ptr,
        basisKernelsGPU.ptr, kernelN,
        kernelW, kernelH,
        sFnValGPU.ptr,
        normGPU_Owner.ptr,
        outImageGPUVar.ptr,
        outImageGPUMsk.ptr,
        blockN,
        shMemSize
    );
    cudaThreadSynchronize();
    if (cudaGetLastError() != cudaSuccess) {
        throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");
    }

    CopyFromGpu(outImageVar.img, outImageGPUVar.ptr, outImageVar.Size());
    CopyFromGpu(outImageMsk.img, outImageGPUMsk.ptr, outImageMsk.Size());
}
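// Note the two-pass structure above: the image plane is convolved via the
// shared GPU_ConvolutionImage_LC_Img helper, then the variance and mask
// planes are processed together by Call_ConvolutionKernel_LC_Var, reusing the
// basis kernels and spatial function values already resident on the GPU.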

#define INSTANTIATE_GPU_ConvolutionMI_LinearCombinationKernel(OutPixelT,InPixelT) \
    template void GPU_ConvolutionMI_LinearCombinationKernel<OutPixelT,InPixelT>( \
        GpuBuffer2D<InPixelT>& inImageImg, \
        GpuBuffer2D<VarPixel>& inImageVar, \
        GpuBuffer2D<MskPixel>& inImageMsk, \
        vector<double> colPos, \
        vector<double> rowPos, \
        std::vector< afwMath::Kernel::SpatialFunctionPtr > sFn, \
        GpuBuffer2D<OutPixelT>& outImageImg, \
        GpuBuffer2D<VarPixel>& outImageVar, \
        GpuBuffer2D<MskPixel>& outImageMsk, \
        std::vector< GpuBuffer2D<KerPixel> >& basisKernels, \
        SpatialFunctionType_t sfType, \
        bool doNormalize \
    );


template <typename OutPixelT, typename InPixelT>
void GPU_ConvolutionImage_SpatiallyInvariantKernel(
    GpuBuffer2D<InPixelT>& inImage,
    GpuBuffer2D<OutPixelT>& outImage,
    GpuBuffer2D<KerPixel>& kernel
)
{
    int kernelW = kernel.width;
    int kernelH = kernel.height;

    GpuMemOwner<InPixelT> inImageGPU;
    inImageGPU.Transfer(inImage);
    if (inImageGPU.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for input image");
    }
    int shMemSize = GetCudaCurSMSharedMemorySize() - shMemBytesUsed;

    // transfer the kernel to the GPU
    GpuMemOwner<KerPixel > basisKernelGPU;
    basisKernelGPU.Transfer(kernel);
    if (basisKernelGPU.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for kernel");
    }
    // allocate array of output images on GPU (one output image per kernel)
    vector< OutPixelT* > outImageGPUPtr(1);
    vector< GpuMemOwner<OutPixelT> > outImageGPU_Owner(1);

    outImageGPUPtr[0] = outImageGPU_Owner[0].Alloc(outImage.Size());
    if (outImageGPUPtr[0] == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for output image");
    }
    GpuMemOwner<OutPixelT*> outImageGPU;
    outImageGPU.TransferVec(outImageGPUPtr);

    int blockN = CalcBlockCount( GetCudaCurSMCount());

    cudaGetLastError(); //clear error status
    mathDetailGpu::Call_SpatiallyInvariantImageConvolutionKernel<OutPixelT, InPixelT>(
        inImageGPU.ptr, inImage.width, inImage.height,
        basisKernelGPU.ptr, 1,
        kernelW, kernelH,
        outImageGPU.ptr,
        blockN,
        shMemSize
    );
    cudaThreadSynchronize();
    if (cudaGetLastError() != cudaSuccess) {
        throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");
    }
    CopyFromGpu(outImage.img, outImageGPUPtr[0], outImage.Size());
}

#define INSTANTIATE_GPU_ConvolutionImage_SpatiallyInvariantKernel(OutPixelT,InPixelT) \
    template void GPU_ConvolutionImage_SpatiallyInvariantKernel<OutPixelT,InPixelT>( \
        GpuBuffer2D<InPixelT>& inImage, \
        GpuBuffer2D<OutPixelT>& outImage, \
        GpuBuffer2D<KerPixel>& kernel \
    );

template <typename OutPixelT, typename InPixelT>
void GPU_ConvolutionMI_SpatiallyInvariantKernel(
    GpuBuffer2D<InPixelT>& inImageImg,
    GpuBuffer2D<VarPixel>& inImageVar,
    GpuBuffer2D<MskPixel>& inImageMsk,
    GpuBuffer2D<OutPixelT>& outImageImg,
    GpuBuffer2D<VarPixel>& outImageVar,
    GpuBuffer2D<MskPixel>& outImageMsk,
    GpuBuffer2D<KerPixel>& kernel
)
{
    int kernelW = kernel.width;
    int kernelH = kernel.height;

    GpuMemOwner<InPixelT> inImageGPUImg;
    inImageGPUImg.Transfer(inImageImg);
    if (inImageGPUImg.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for input image");
    }
    GpuMemOwner<VarPixel> inImageGPUVar;
    inImageGPUVar.Transfer(inImageVar);
    if (inImageGPUVar.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for input variance");
    }
    GpuMemOwner<MskPixel> inImageGPUMsk;
    inImageGPUMsk.Transfer(inImageMsk);
    if (inImageGPUMsk.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for input mask");
    }
    int shMemSize = GetCudaCurSMSharedMemorySize() - shMemBytesUsed;

    //allocate kernel on GPU
    GpuMemOwner<KerPixel > basisKernelGPU;
    basisKernelGPU.Transfer(kernel);
    if (basisKernelGPU.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for kernel");
    }

    // allocate array of output image planes on GPU
    vector< OutPixelT* > outImageGPUPtrImg(1);
    vector< VarPixel* > outImageGPUPtrVar(1);
    vector< MskPixel* > outImageGPUPtrMsk(1);

    vector< GpuMemOwner<OutPixelT> > outImageGPU_OwnerImg(1);
    vector< GpuMemOwner<VarPixel > > outImageGPU_OwnerVar(1);
    vector< GpuMemOwner<MskPixel > > outImageGPU_OwnerMsk(1);

    outImageGPUPtrImg[0] = outImageGPU_OwnerImg[0].Alloc(outImageImg.Size());
    if (outImageGPUPtrImg[0] == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for output image");
    }
    outImageGPUPtrVar[0] = outImageGPU_OwnerVar[0].Alloc(outImageVar.Size());
    if (outImageGPUPtrVar[0] == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for output variance");
    }
    outImageGPUPtrMsk[0] = outImageGPU_OwnerMsk[0].Alloc(outImageMsk.Size());
    if (outImageGPUPtrMsk[0] == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for output mask");
    }

    GpuMemOwner<OutPixelT*> outImageGPUImg;
    outImageGPUImg.TransferVec(outImageGPUPtrImg);
    GpuMemOwner<VarPixel*> outImageGPUVar;
    outImageGPUVar.TransferVec(outImageGPUPtrVar);
    GpuMemOwner<MskPixel*> outImageGPUMsk;
    outImageGPUMsk.TransferVec(outImageGPUPtrMsk);

    int blockN = CalcBlockCount( GetCudaCurSMCount());

    // convolve the image plane with the kernel
    mathDetailGpu::Call_SpatiallyInvariantImageConvolutionKernel<OutPixelT, InPixelT>(
        inImageGPUImg.ptr, inImageImg.width, inImageImg.height,
        basisKernelGPU.ptr, 1,
        kernelW, kernelH,
        outImageGPUImg.ptr,
        blockN,
        shMemSize
    );
    // Square the kernel in place: when convolving the variance plane,
    // variances add in quadrature (Var_out = sum_i k_i^2 * Var_in_i,
    // ignoring pixel covariance), so the variance is convolved with the
    // squared kernel below.
    for (int y = 0; y < kernelH; y++) {
        for (int x = 0; x < kernelW; x++) {
            kernel.Pixel(x, y) *= kernel.Pixel(x, y);
        }
    }

    CopyFromGpu(outImageImg.img, outImageGPUPtrImg[0], outImageImg.Size());

    // upload the squared kernel, reusing the GPU buffer
    basisKernelGPU.CopyToGpu(kernel);

    cudaGetLastError(); //clear last error

    // convolve the variance plane with the squared kernel
    mathDetailGpu::Call_SpatiallyInvariantImageConvolutionKernel<VarPixel, VarPixel>(
        inImageGPUVar.ptr, inImageVar.width, inImageVar.height,
        basisKernelGPU.ptr, 1,
        kernelW, kernelH,
        outImageGPUVar.ptr,
        blockN,
        shMemSize
    );

    cudaThreadSynchronize();
    if (cudaGetLastError() != cudaSuccess) {
        throw LSST_EXCEPT(GpuRuntimeError, "GPU variance calculation failed to run");
    }
    // convolve the mask plane over the kernel footprint
    mathDetailGpu::Call_SpatiallyInvariantMaskConvolutionKernel(
        inImageGPUMsk.ptr, inImageMsk.width, inImageMsk.height,
        basisKernelGPU.ptr, 1,
        kernelW, kernelH,
        outImageGPUMsk.ptr,
        blockN,
        shMemSize
    );
    cudaThreadSynchronize();
    if (cudaGetLastError() != cudaSuccess) {
        throw LSST_EXCEPT(GpuRuntimeError, "GPU mask calculation failed to run");
    }
    CopyFromGpu(outImageVar.img, outImageGPUPtrVar[0], outImageVar.Size());
    CopyFromGpu(outImageMsk.img, outImageGPUPtrMsk[0], outImageMsk.Size());
}

#define INSTANTIATE_GPU_ConvolutionMI_SpatiallyInvariantKernel(OutPixelT,InPixelT) \
    template void GPU_ConvolutionMI_SpatiallyInvariantKernel<OutPixelT,InPixelT>( \
        GpuBuffer2D<InPixelT>& inImageImg, \
        GpuBuffer2D<VarPixel>& inImageVar, \
        GpuBuffer2D<MskPixel>& inImageMsk, \
        GpuBuffer2D<OutPixelT>& outImageImg, \
        GpuBuffer2D<VarPixel>& outImageVar, \
        GpuBuffer2D<MskPixel>& outImageMsk, \
        GpuBuffer2D<KerPixel>& kernel \
    );

/*
 * Explicit instantiation
 */

#define INSTANTIATE(OutPixelT,InPixelT) \
    INSTANTIATE_GPU_ConvolutionImage_LinearCombinationKernel(OutPixelT,InPixelT) \
    INSTANTIATE_GPU_ConvolutionMI_LinearCombinationKernel(OutPixelT,InPixelT) \
    INSTANTIATE_GPU_ConvolutionImage_SpatiallyInvariantKernel(OutPixelT,InPixelT) \
    INSTANTIATE_GPU_ConvolutionMI_SpatiallyInvariantKernel(OutPixelT,InPixelT)

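// The template definitions above live in this .cc file rather than a header,
// so every (OutPixelT, InPixelT) pair used elsewhere in afw must be
// instantiated explicitly below.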

INSTANTIATE(double, double)
INSTANTIATE(double, float)
INSTANTIATE(double, int)
INSTANTIATE(double, std::uint16_t)
INSTANTIATE(float, float)
INSTANTIATE(float, int)
INSTANTIATE(float, std::uint16_t)
INSTANTIATE(int, int)
INSTANTIATE(std::uint16_t, std::uint16_t)

}
}
}
} //namespace lsst::afw::math::detail ends

#endif //GPU_BUILD