cudaConvWrapper.cc
// -*- LSST-C++ -*-

/*
 * LSST Data Management System
 * Copyright 2008, 2009, 2010 LSST Corporation.
 *
 * This product includes software developed by the
 * LSST Project (http://www.lsst.org/).
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the LSST License Statement and
 * the GNU General Public License along with this program. If not,
 * see <http://www.lsstcorp.org/LegalNotices/>.
 */

#ifndef GPU_BUILD

namespace lsst {
namespace afw {
namespace math {
namespace detail {

void TestGpuKernel(int& ret1, int& ret2) {
    ret1 = 0;
    ret2 = 0;
}

bool IsSufficientSharedMemoryAvailable_ForImgBlock(int filterW, int filterH, int pixSize)
{
    return false;
}
bool IsSufficientSharedMemoryAvailable_ForImgAndMaskBlock(int filterW, int filterH, int pixSize)
{
    return false;
}
bool IsSufficientSharedMemoryAvailable_ForSfn(int order, int kernelN)
{
    return false;
}

}
}
}
}

#else

#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>

#include "lsst/afw/math/Kernel.h"
// NOTE: several project includes are elided in this extract. The headers below
// are inferred from the symbols used in this file (GpuBuffer2D, GpuMemOwner,
// the CUDA device-query helpers, the GPU kernel declarations, and the LSST
// exception macros); the exact paths are assumptions.
#include "lsst/pex/exceptions.h"
#include "lsst/afw/gpu/GpuExceptions.h"
#include "lsst/afw/gpu/detail/GpuBuffer2D.h"
#include "lsst/afw/gpu/detail/CudaMemory.h"
#include "lsst/afw/gpu/detail/CudaQueryDevice.h"
#include "lsst/afw/math/detail/convCUDA.h"
#include "lsst/afw/math/detail/cudaConvWrapper.h"

using namespace std;
using namespace lsst::afw::gpu;
using namespace lsst::afw::gpu::detail;
namespace afwImage = lsst::afw::image;
namespace afwMath = lsst::afw::math;
namespace mathDetailGpu = lsst::afw::math::detail::gpu;

namespace lsst {
namespace afw {
namespace math {
namespace detail {

namespace {
const int shMemBytesUsed = 200;
}

// Returns true if there is sufficient shared memory for loading an image block,
// where the image block includes the filter frame.
bool IsSufficientSharedMemoryAvailable_ForImgBlock(int filterW, int filterH, int pixSize)
{
    int shMemSize = GetCudaCurSMSharedMemorySize();
    int bufferSize = (filterW + blockSizeX - 1) * (filterH + blockSizeY - 1) * pixSize;

    return shMemSize - shMemBytesUsed - bufferSize > 0;
}
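
// Worked example of the budget above (a sketch; the blockSizeX = blockSizeY = 16
// and 48 kB shared-memory figures are assumed for illustration, not taken from
// convCUDA.h):
//     filterW = filterH = 17, pixSize = sizeof(float) = 4
//     bufferSize = (17 + 16 - 1) * (17 + 16 - 1) * 4 = 32 * 32 * 4 = 4096 bytes
//     49152 - 200 - 4096 > 0, so the image block fits and the function returns true.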

// Returns true if there is sufficient shared memory for loading an image block
// and an accompanying block of mask data (mask image block),
// where both the image block and the mask image block include the filter frame.
bool IsSufficientSharedMemoryAvailable_ForImgAndMaskBlock(int filterW, int filterH, int pixSize)
{
    int shMemSize = GetCudaCurSMSharedMemorySize();
    int imgBufferSize = (filterW + blockSizeX - 1) * (filterH + blockSizeY - 1) * pixSize;
    int mskBufferSize = (filterW + blockSizeX - 1) * (filterH + blockSizeY - 1) * sizeof(MskPixel);

    int memRemaining = shMemSize - shMemBytesUsed - imgBufferSize - mskBufferSize;

    return memRemaining > 0;
}

// Returns true if there is sufficient shared memory for loading
// the parameters and temporary values for the Chebyshev function and normalization
bool IsSufficientSharedMemoryAvailable_ForSfn(int order, int kernelN)
{
    int shMemSize = GetCudaCurSMSharedMemorySize();

    const int coeffN = order + 1;
    const int coeffPadding = coeffN + 1 - (coeffN % 2);
    int paramN = (order + 1) * (order + 2) / 2;

    int yCoeffsAll = coeffPadding * blockSizeX * sizeof(double);
    int xChebyAll = coeffPadding * blockSizeX * sizeof(double);
    int smemParams = paramN * sizeof(double);

    int smemKernelSum = kernelN * sizeof(double);
    int smemSfnPtr = kernelN * sizeof(double*);

    int memRemainingSfn = shMemSize - shMemBytesUsed - yCoeffsAll - xChebyAll - smemParams;
    int memRemainingNorm = shMemSize - shMemBytesUsed - smemKernelSum - smemSfnPtr;

    return min(memRemainingSfn, memRemainingNorm) > 0;
}
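
// A note on the padding arithmetic above: order = 3 gives coeffN = 4 and
// coeffPadding = 4 + 1 - 0 = 5, while order = 4 gives coeffN = 5 and
// coeffPadding = 5 + 1 - 1 = 5. The padded count is always odd, which
// (presumably, as the motivating design choice) staggers consecutive threads'
// coefficient rows across different shared-memory banks.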

// Decides on the best GPU block count.
// Uses a simple heuristic (neither too few blocks nor too many),
// but guarantees that the number of blocks will be a multiple of the number of multiprocessors.
int CalcBlockCount(int multiprocCount)
{
    if (multiprocCount < 12) return multiprocCount * 4;
    if (multiprocCount < 24) return multiprocCount * 2;
    return multiprocCount;
}
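
// For example: CalcBlockCount(8) == 32, CalcBlockCount(16) == 32, and
// CalcBlockCount(30) == 30 -- always a multiple of the multiprocessor count,
// with small devices oversubscribed so that every multiprocessor stays busy.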

// Calls the test GPU kernel.
// Should return 5 and 8 in ret1 and ret2.
void TestGpuKernel(int& ret1, int& ret2)
{
    int res[2];

    GpuMemOwner<int> resGpu;
    resGpu.Alloc(2);

    gpu::CallTestGpuKernel(resGpu.ptr);

    resGpu.CopyFromGpu(res);

    ret1 = res[0];
    ret2 = res[1];
}
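
// A minimal caller sketch (hypothetical, not part of the original file):
//
//     int a = -1, b = -1;
//     lsst::afw::math::detail::TestGpuKernel(a, b);
//     bool gpuWorks = (a == 5 && b == 8);   // the non-GPU stub build yields 0, 0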

namespace {

// Calculates the sum of each image in the 'images' vector
template <typename ResultT, typename InT>
vector<ResultT> SumsOfImages(const vector< GpuBuffer2D<InT> >& images)
{
    int n = int(images.size());
    vector<ResultT> sum(n);
    for (int i = 0; i < n; i++) {
        ResultT totalSum = 0;
        int h = images[i].height;
        int w = images[i].width;

        // accumulate each row separately to reduce floating-point round-off
        for (int y = 0; y < h; y++) {
            ResultT rowSum = 0;
            for (int x = 0; x < w; x++) {
                rowSum += images[i].Pixel(x, y);
            }
            totalSum += rowSum;
        }
        sum[i] = totalSum;
    }
    return sum;
}

template <typename OutPixelT, typename InPixelT>
void GPU_ConvolutionImage_LC_Img(
    const GpuBuffer2D<InPixelT>& inImage,
    const vector<double>& colPos,
    const vector<double>& rowPos,
    const std::vector< afwMath::Kernel::SpatialFunctionPtr >& sFn,
    const vector<double*>& sFnValGPUPtr, //output
    double** sFnValGPU, //output
    SpatialFunctionType_t sfType,
    GpuBuffer2D<OutPixelT>& outImage, //output
    KerPixel* basisKernelsListGPU,
    int kernelW, int kernelH,
    const vector<double>& basisKernelSums, //input
    double* normGPU, //output
    bool doNormalize
)
{
    const int kernelN = sFn.size();

    //transfer input image
    GpuMemOwner<InPixelT > inImageGPU;
    inImageGPU.Transfer(inImage);
    if (inImageGPU.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for input image");
    }
    // allocate output image planes on GPU
    GpuMemOwner<OutPixelT> outImageGPU;
    outImageGPU.Alloc( outImage.Size());
    if (outImageGPU.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for output image");
    }
    //transfer coordinate transform data
    GpuMemOwner<double> colPosGPU_Owner;
    colPosGPU_Owner.TransferVec(colPos);
    if (colPosGPU_Owner.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError,
                          "Not enough memory on GPU for column coordinate transform data");
    }
    GpuMemOwner<double> rowPosGPU_Owner;
    rowPosGPU_Owner.TransferVec(rowPos);
    if (rowPosGPU_Owner.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError,
                          "Not enough memory on GPU for row coordinate transform data");
    }
    vector< double* > sFnParamsGPUPtr(kernelN);
    vector< GpuMemOwner<double> > sFnParamsGPU_Owner(kernelN);

    //transfer sfn parameters to GPU
    for (int i = 0; i < kernelN; i++) {
        std::vector<double> spatialParams = sFn[i]->getParameters();
        sFnParamsGPUPtr[i] = sFnParamsGPU_Owner[i].TransferVec(spatialParams);
        if (sFnParamsGPUPtr[i] == NULL) {
            throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for spatial function parameters");
        }
    }

    int shMemSize = GetCudaCurSMSharedMemorySize() - shMemBytesUsed;

    for (int i = 0; i < kernelN; i++)
    {
        cudaGetLastError(); //clear error status

        if (sfType == sftPolynomial) {
            const afwMath::PolynomialFunction2<double>* polySfn =
                dynamic_cast<const afwMath::PolynomialFunction2<double>*>( sFn[i].get() );

            gpu::Call_PolynomialImageValues(
                sFnValGPUPtr[i], outImage.width, outImage.height,
                polySfn->getOrder(),
                sFnParamsGPU_Owner[i].ptr,
                rowPosGPU_Owner.ptr,
                colPosGPU_Owner.ptr,
                shMemSize
            );

            //cudaThreadSynchronize();
            cudaError_t cuda_err = cudaGetLastError();
            if (cuda_err != cudaSuccess) {
                throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");
            }
        }
        if (sfType == sftChebyshev) {
            const afwMath::Chebyshev1Function2<double>* chebSfn =
                dynamic_cast<const afwMath::Chebyshev1Function2<double>*>( sFn[i].get() );

            lsst::afw::geom::Box2D const xyRange = chebSfn->getXYRange();

            gpu::Call_ChebyshevImageValues(
                sFnValGPUPtr[i], outImage.width, outImage.height,
                chebSfn->getOrder(),
                sFnParamsGPU_Owner[i].ptr,
                rowPosGPU_Owner.ptr,
                colPosGPU_Owner.ptr,
                xyRange.getMinX(), xyRange.getMinY(), xyRange.getMaxX(), xyRange.getMaxY(),
                shMemSize
            );

            //cudaThreadSynchronize();
            cudaError_t cuda_err = cudaGetLastError();
            if (cuda_err != cudaSuccess) {
                throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");
            }
        }
    }
    cudaThreadSynchronize();
    cudaError_t cuda_err = cudaGetLastError();
    if (cuda_err != cudaSuccess) {
        throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");
    }

    int blockN = CalcBlockCount( GetCudaCurSMCount());

    //transfer basis kernel sums
    if (doNormalize) {
        GpuMemOwner<double> basisKernelSumsGPU;
        basisKernelSumsGPU.TransferVec(basisKernelSums);

        bool isDivideByZero = false;
        GpuMemOwner<bool> isDivideByZeroGPU;
        isDivideByZeroGPU.Transfer(&isDivideByZero, 1);

        gpu::Call_NormalizationImageValues(
            normGPU, outImage.width, outImage.height,
            sFnValGPU, kernelN,
            basisKernelSumsGPU.ptr,
            isDivideByZeroGPU.ptr,
            blockN,
            shMemSize
        );
        cudaThreadSynchronize();
        if (cudaGetLastError() != cudaSuccess) {
            throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");
        }
        CopyFromGpu<bool>(&isDivideByZero, isDivideByZeroGPU.ptr, 1);
        if (isDivideByZero) {
            throw LSST_EXCEPT(pexExcept::OverflowError, "Cannot normalize; kernel sum is 0");
        }
    }

    cudaGetLastError(); //clear error status
    mathDetailGpu::Call_ConvolutionKernel_LC_Img(
        inImageGPU.ptr, inImage.width, inImage.height,
        basisKernelsListGPU, kernelN,
        kernelW, kernelH,
        &sFnValGPU[0],
        normGPU,
        outImageGPU.ptr,
        blockN,
        shMemSize
    );
    cudaThreadSynchronize();
    if (cudaGetLastError() != cudaSuccess) {
        throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");
    }

    CopyFromGpu(outImage.img, outImageGPU.ptr, outImage.Size() );

}

} //local namespace ends

template <typename OutPixelT, typename InPixelT>
void GPU_ConvolutionImage_LinearCombinationKernel(
    GpuBuffer2D<InPixelT>& inImage,
    vector<double> colPos,
    vector<double> rowPos,
    std::vector< afwMath::Kernel::SpatialFunctionPtr > sFn,
    GpuBuffer2D<OutPixelT>& outImage,
    std::vector< GpuBuffer2D<KerPixel> >& basisKernels,
    SpatialFunctionType_t sfType,
    bool doNormalize
)
{
    assert(basisKernels.size() == sFn.size());

    int outWidth = outImage.width;
    int outHeight = outImage.height;

    const int kernelN = sFn.size();
    const int kernelW = basisKernels[0].width;
    const int kernelH = basisKernels[0].height;
    const int kernelSize = kernelW * kernelH;

    for (int i = 0; i < kernelN; i++) {
        assert(kernelW == basisKernels[i].width);
        assert(kernelH == basisKernels[i].height);
    }

    // transfer the array of basis kernels to the GPU
    GpuMemOwner<KerPixel > basisKernelsGPU;
    basisKernelsGPU.Alloc(kernelSize * kernelN);

    for (int i = 0; i < kernelN; i++) {
        KerPixel* kernelBeg = basisKernelsGPU.ptr + (kernelSize * i);
        CopyToGpu(kernelBeg,
                  basisKernels[i].img,
                  kernelSize
                 );
    }

    // allocate array of spatial function value images on GPU
    vector< double* > sFnValGPUPtr(kernelN);
    vector< GpuMemOwner<double > > sFnValGPU_Owner(kernelN);

    for (int i = 0; i < kernelN; i++) {
        sFnValGPUPtr[i] = sFnValGPU_Owner[i].Alloc(outWidth * outHeight);
        if (sFnValGPUPtr[i] == NULL) {
            throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for spatial function values");
        }
    }
    GpuMemOwner<double*> sFnValGPU;
    sFnValGPU.TransferVec(sFnValGPUPtr);

    GpuMemOwner<double > normGPU_Owner;
    vector<double> basisKernelSums(kernelN);
    if (doNormalize) {
        //allocate normalization coefficients
        normGPU_Owner.Alloc(outWidth * outHeight);
        if (normGPU_Owner.ptr == NULL) {
            throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for normalization coefficients");
        }

        //calculate basis kernel sums
        basisKernelSums = SumsOfImages<double, KerPixel>(basisKernels);
    }

    GPU_ConvolutionImage_LC_Img(
        inImage,
        colPos, rowPos,
        sFn,
        sFnValGPUPtr, //output
        sFnValGPU.ptr, //output
        sfType,
        outImage, //output
        basisKernelsGPU.ptr,
        kernelW, kernelH,
        basisKernelSums, //input
        normGPU_Owner.ptr, //output
        doNormalize
    );

}
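
// The computation performed above, written out (a sketch of the underlying math,
// not additional API): with basis kernels K_i, spatial functions f_i, and basis
// kernel sums S_i = sum(K_i), each output pixel is
//
//     out(x, y) = sum_i f_i(x, y) * (K_i conv in)(x, y)
//
// and, when doNormalize is set, the result is divided by the local kernel sum
// sum_i f_i(x, y) * S_i (the normGPU image), which is why a zero kernel sum
// raises OverflowError in GPU_ConvolutionImage_LC_Img.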

#define INSTANTIATE_GPU_ConvolutionImage_LinearCombinationKernel(OutPixelT,InPixelT) \
    template void GPU_ConvolutionImage_LinearCombinationKernel<OutPixelT,InPixelT>( \
        GpuBuffer2D<InPixelT>& inImage, \
        vector<double> colPos, \
        vector<double> rowPos, \
        std::vector< afwMath::Kernel::SpatialFunctionPtr > sFn, \
        GpuBuffer2D<OutPixelT>& outImage, \
        std::vector< GpuBuffer2D<KerPixel> >& basisKernels, \
        SpatialFunctionType_t sfType, \
        bool doNormalize \
    );

template <typename OutPixelT, typename InPixelT>
void GPU_ConvolutionMI_LinearCombinationKernel(
    GpuBuffer2D<InPixelT>& inImageImg,
    GpuBuffer2D<VarPixel>& inImageVar,
    GpuBuffer2D<MskPixel>& inImageMsk,
    vector<double> colPos,
    vector<double> rowPos,
    std::vector< afwMath::Kernel::SpatialFunctionPtr > sFn,
    GpuBuffer2D<OutPixelT>& outImageImg,
    GpuBuffer2D<VarPixel>& outImageVar,
    GpuBuffer2D<MskPixel>& outImageMsk,
    std::vector< GpuBuffer2D<KerPixel> >& basisKernels,
    SpatialFunctionType_t sfType,
    bool doNormalize
)
{
    assert(basisKernels.size() == sFn.size());
    assert(outImageImg.width == outImageVar.width);
    assert(outImageImg.width == outImageMsk.width);
    assert(outImageImg.height == outImageVar.height);
    assert(outImageImg.height == outImageMsk.height);

    int outWidth = outImageImg.width;
    int outHeight = outImageImg.height;

    const int kernelN = sFn.size();
    const int kernelW = basisKernels[0].width;
    const int kernelH = basisKernels[0].height;
    const int kernelSize = kernelW * kernelH;

    for (int i = 0; i < kernelN; i++) {
        assert(kernelW == basisKernels[i].width);
        assert(kernelH == basisKernels[i].height);
    }

    // transfer basis kernels to GPU
    GpuMemOwner<KerPixel > basisKernelsGPU;
    basisKernelsGPU.Alloc(kernelSize * kernelN);

    for (int i = 0; i < kernelN; i++) {
        KerPixel* kernelBeg = basisKernelsGPU.ptr + (kernelSize * i);
        CopyToGpu(kernelBeg,
                  basisKernels[i].img,
                  kernelSize
                 );
    }

    //alloc sFn images on GPU
    vector< double* > sFnValGPUPtr(kernelN);
    vector< GpuMemOwner<double > > sFnValGPU_Owner(kernelN);

    for (int i = 0; i < kernelN; i++) {
        sFnValGPUPtr[i] = sFnValGPU_Owner[i].Alloc(outWidth * outHeight);
        if (sFnValGPUPtr[i] == NULL) {
            throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for spatial function values");
        }
    }
    GpuMemOwner<double*> sFnValGPU;
    sFnValGPU.TransferVec(sFnValGPUPtr);

    //allocate normalization coefficients image on GPU
    GpuMemOwner<double > normGPU_Owner;
    std::vector<double> basisKernelSums(kernelN);
    if (doNormalize) {
        //allocate normalization coefficients
        normGPU_Owner.Alloc(outWidth * outHeight);
        if (normGPU_Owner.ptr == NULL) {
            throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for normalization coefficients");
        }
        //calculate basis kernel sums
        basisKernelSums = SumsOfImages<double, KerPixel>(basisKernels);
    }

    GPU_ConvolutionImage_LC_Img(
        inImageImg,
        colPos, rowPos,
        sFn,
        sFnValGPUPtr, //output
        sFnValGPU.ptr, //output
        sfType,
        outImageImg, //output
        basisKernelsGPU.ptr,
        kernelW, kernelH,
        basisKernelSums, //input
        normGPU_Owner.ptr, //output
        doNormalize
    );

    //transfer input image planes to GPU
    GpuMemOwner<VarPixel> inImageGPUVar;
    inImageGPUVar.Transfer(inImageVar);
    if (inImageGPUVar.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for input variance");
    }
    GpuMemOwner<MskPixel> inImageGPUMsk;
    inImageGPUMsk.Transfer(inImageMsk);
    if (inImageGPUMsk.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for input mask");
    }

    // allocate output image planes on GPU
    GpuMemOwner<VarPixel > outImageGPUVar;
    outImageGPUVar.Alloc( outImageVar.Size());
    if (outImageGPUVar.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for output variance");
    }
    GpuMemOwner<MskPixel > outImageGPUMsk;
    outImageGPUMsk.Alloc( outImageMsk.Size());
    if (outImageGPUMsk.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for output mask");
    }
    int shMemSize = GetCudaCurSMSharedMemorySize() - shMemBytesUsed;
    int blockN = CalcBlockCount( GetCudaCurSMCount());

    cudaGetLastError(); //clear error status
    mathDetailGpu::Call_ConvolutionKernel_LC_Var(
        inImageGPUVar.ptr, inImageVar.width, inImageVar.height,
        inImageGPUMsk.ptr,
        basisKernelsGPU.ptr, kernelN,
        kernelW, kernelH,
        sFnValGPU.ptr,
        normGPU_Owner.ptr,
        outImageGPUVar.ptr,
        outImageGPUMsk.ptr,
        blockN,
        shMemSize
    );
    cudaThreadSynchronize();
    if (cudaGetLastError() != cudaSuccess) {
        throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");
    }

    CopyFromGpu(outImageVar.img, outImageGPUVar.ptr, outImageVar.Size() );
    CopyFromGpu(outImageMsk.img, outImageGPUMsk.ptr, outImageMsk.Size() );
}

#define INSTANTIATE_GPU_ConvolutionMI_LinearCombinationKernel(OutPixelT,InPixelT) \
    template void GPU_ConvolutionMI_LinearCombinationKernel<OutPixelT,InPixelT>( \
        GpuBuffer2D<InPixelT>& inImageImg, \
        GpuBuffer2D<VarPixel>& inImageVar, \
        GpuBuffer2D<MskPixel>& inImageMsk, \
        vector<double> colPos, \
        vector<double> rowPos, \
        std::vector< afwMath::Kernel::SpatialFunctionPtr > sFn, \
        GpuBuffer2D<OutPixelT>& outImageImg, \
        GpuBuffer2D<VarPixel>& outImageVar, \
        GpuBuffer2D<MskPixel>& outImageMsk, \
        std::vector< GpuBuffer2D<KerPixel> >& basisKernels, \
        SpatialFunctionType_t sfType, \
        bool doNormalize \
    );


template <typename OutPixelT, typename InPixelT>
void GPU_ConvolutionImage_SpatiallyInvariantKernel(
    GpuBuffer2D<InPixelT>& inImage,
    GpuBuffer2D<OutPixelT>& outImage,
    GpuBuffer2D<KerPixel>& kernel
)
{
    int kernelW = kernel.width;
    int kernelH = kernel.height;

    GpuMemOwner<InPixelT> inImageGPU;
    inImageGPU.Transfer(inImage);
    if (inImageGPU.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for input image");
    }
    int shMemSize = GetCudaCurSMSharedMemorySize() - shMemBytesUsed;

    // allocate array of kernels on GPU
    GpuMemOwner<KerPixel > basisKernelGPU;
    basisKernelGPU.Transfer(kernel);
    if (basisKernelGPU.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for kernel");
    }
    // allocate array of output images on GPU (one output image per kernel)
    vector< OutPixelT* > outImageGPUPtr(1);
    vector< GpuMemOwner<OutPixelT> > outImageGPU_Owner(1);

    outImageGPUPtr[0] = outImageGPU_Owner[0].Alloc( outImage.Size());
    if (outImageGPUPtr[0] == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for output image");
    }
    GpuMemOwner<OutPixelT*> outImageGPU;
    outImageGPU.TransferVec(outImageGPUPtr);

    int blockN = CalcBlockCount( GetCudaCurSMCount());

    cudaGetLastError(); //clear error status
    mathDetailGpu::Call_SpatiallyInvariantImageConvolutionKernel<OutPixelT, InPixelT>(
        inImageGPU.ptr, inImage.width, inImage.height,
        basisKernelGPU.ptr, 1,
        kernelW, kernelH,
        outImageGPU.ptr,
        blockN,
        shMemSize
    );
    cudaThreadSynchronize();
    if (cudaGetLastError() != cudaSuccess) {
        throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");
    }
    CopyFromGpu(outImage.img, outImageGPUPtr[0], outImage.Size() );
}

#define INSTANTIATE_GPU_ConvolutionImage_SpatiallyInvariantKernel(OutPixelT,InPixelT) \
    template void GPU_ConvolutionImage_SpatiallyInvariantKernel<OutPixelT,InPixelT>( \
        GpuBuffer2D<InPixelT>& inImage, \
        GpuBuffer2D<OutPixelT>& outImage, \
        GpuBuffer2D<KerPixel>& kernel \
    );


template <typename OutPixelT, typename InPixelT>
void GPU_ConvolutionMI_SpatiallyInvariantKernel(
    GpuBuffer2D<InPixelT>& inImageImg,
    GpuBuffer2D<VarPixel>& inImageVar,
    GpuBuffer2D<MskPixel>& inImageMsk,
    GpuBuffer2D<OutPixelT>& outImageImg,
    GpuBuffer2D<VarPixel>& outImageVar,
    GpuBuffer2D<MskPixel>& outImageMsk,
    GpuBuffer2D<KerPixel>& kernel
)
{
    int kernelW = kernel.width;
    int kernelH = kernel.height;

    GpuMemOwner<InPixelT> inImageGPUImg;
    inImageGPUImg.Transfer(inImageImg);
    if (inImageGPUImg.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for input image");
    }
    GpuMemOwner<VarPixel> inImageGPUVar;
    inImageGPUVar.Transfer(inImageVar);
    if (inImageGPUVar.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for input variance");
    }
    GpuMemOwner<MskPixel> inImageGPUMsk;
    inImageGPUMsk.Transfer(inImageMsk);
    if (inImageGPUMsk.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for input mask");
    }
    int shMemSize = GetCudaCurSMSharedMemorySize() - shMemBytesUsed;

    //allocate kernel on GPU
    GpuMemOwner<KerPixel > basisKernelGPU;
    basisKernelGPU.Transfer(kernel);
    if (basisKernelGPU.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for kernel");
    }

    // allocate array of output image planes on GPU
    vector< OutPixelT* > outImageGPUPtrImg(1);
    vector< VarPixel* > outImageGPUPtrVar(1);
    vector< MskPixel* > outImageGPUPtrMsk(1);

    vector< GpuMemOwner<OutPixelT> > outImageGPU_OwnerImg(1);
    vector< GpuMemOwner<VarPixel > > outImageGPU_OwnerVar(1);
    vector< GpuMemOwner<MskPixel > > outImageGPU_OwnerMsk(1);

    outImageGPUPtrImg[0] = outImageGPU_OwnerImg[0].Alloc( outImageImg.Size());
    if (outImageGPUPtrImg[0] == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for output image");
    }
    outImageGPUPtrVar[0] = outImageGPU_OwnerVar[0].Alloc( outImageVar.Size());
    if (outImageGPUPtrVar[0] == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for output variance");
    }
    outImageGPUPtrMsk[0] = outImageGPU_OwnerMsk[0].Alloc( outImageMsk.Size());
    if (outImageGPUPtrMsk[0] == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for output mask");
    }

    GpuMemOwner<OutPixelT*> outImageGPUImg;
    outImageGPUImg.TransferVec(outImageGPUPtrImg);
    GpuMemOwner<VarPixel*> outImageGPUVar;
    outImageGPUVar.TransferVec(outImageGPUPtrVar);
    GpuMemOwner<MskPixel*> outImageGPUMsk;
    outImageGPUMsk.TransferVec(outImageGPUPtrMsk);

    int blockN = CalcBlockCount( GetCudaCurSMCount());

    mathDetailGpu::Call_SpatiallyInvariantImageConvolutionKernel<OutPixelT, InPixelT>(
        inImageGPUImg.ptr, inImageImg.width, inImageImg.height,
        basisKernelGPU.ptr, 1,
        kernelW, kernelH,
        outImageGPUImg.ptr,
        blockN,
        shMemSize
    );
    //square the kernel (on the host, while the image convolution runs on the GPU)
    for (int y = 0; y < kernelH; y++) {
        for (int x = 0; x < kernelW; x++) {
            kernel.Pixel(x, y) *= kernel.Pixel(x, y);
        }
    }
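
    // Why the kernel is squared (assuming statistically independent pixel noise):
    // for a weighted sum out = sum_i k_i * in_i, the variance propagates as
    //     Var(out) = sum_i k_i^2 * Var(in_i),
    // so the variance plane must be convolved with the element-wise squared
    // kernel, which is uploaded to the GPU below.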

    CopyFromGpu(outImageImg.img, outImageGPUPtrImg[0], outImageImg.Size() );

    basisKernelGPU.CopyToGpu(kernel);

    cudaGetLastError(); //clear last error

    mathDetailGpu::Call_SpatiallyInvariantImageConvolutionKernel<VarPixel, VarPixel>(
        inImageGPUVar.ptr, inImageVar.width, inImageVar.height,
        basisKernelGPU.ptr, 1,
        kernelW, kernelH,
        outImageGPUVar.ptr,
        blockN,
        shMemSize
    );

    cudaThreadSynchronize();
    if (cudaGetLastError() != cudaSuccess) {
        throw LSST_EXCEPT(GpuRuntimeError, "GPU variance calculation failed to run");
    }
    mathDetailGpu::Call_SpatiallyInvariantMaskConvolutionKernel(
        inImageGPUMsk.ptr, inImageMsk.width, inImageMsk.height,
        basisKernelGPU.ptr, 1,
        kernelW, kernelH,
        outImageGPUMsk.ptr,
        blockN,
        shMemSize
    );
    cudaThreadSynchronize();
    if (cudaGetLastError() != cudaSuccess) {
        throw LSST_EXCEPT(GpuRuntimeError, "GPU mask calculation failed to run");
    }
    CopyFromGpu(outImageVar.img, outImageGPUPtrVar[0], outImageVar.Size() );
    CopyFromGpu(outImageMsk.img, outImageGPUPtrMsk[0], outImageMsk.Size() );
}

#define INSTANTIATE_GPU_ConvolutionMI_SpatiallyInvariantKernel(OutPixelT,InPixelT) \
    template void GPU_ConvolutionMI_SpatiallyInvariantKernel<OutPixelT,InPixelT>( \
        GpuBuffer2D<InPixelT>& inImageImg, \
        GpuBuffer2D<VarPixel>& inImageVar, \
        GpuBuffer2D<MskPixel>& inImageMsk, \
        GpuBuffer2D<OutPixelT>& outImageImg, \
        GpuBuffer2D<VarPixel>& outImageVar, \
        GpuBuffer2D<MskPixel>& outImageMsk, \
        GpuBuffer2D<KerPixel>& kernel \
    );


/*
 * Explicit instantiation
 */

#define INSTANTIATE(OutPixelT,InPixelT) \
    INSTANTIATE_GPU_ConvolutionImage_LinearCombinationKernel(OutPixelT,InPixelT) \
    INSTANTIATE_GPU_ConvolutionMI_LinearCombinationKernel(OutPixelT,InPixelT) \
    INSTANTIATE_GPU_ConvolutionImage_SpatiallyInvariantKernel(OutPixelT,InPixelT) \
    INSTANTIATE_GPU_ConvolutionMI_SpatiallyInvariantKernel(OutPixelT,InPixelT)


INSTANTIATE(double, double)
INSTANTIATE(double, float)
INSTANTIATE(double, int)
INSTANTIATE(double, boost::uint16_t)
INSTANTIATE(float, float)
INSTANTIATE(float, int)
INSTANTIATE(float, boost::uint16_t)
INSTANTIATE(int, int)
INSTANTIATE(boost::uint16_t, boost::uint16_t)

}
}
}
} //namespace lsst::afw::math::detail ends

#endif //GPU_BUILD