cudaConvWrapper.cc
// -*- LSST-C++ -*-

/*
 * LSST Data Management System
 * Copyright 2008, 2009, 2010 LSST Corporation.
 *
 * This product includes software developed by the
 * LSST Project (http://www.lsst.org/).
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the LSST License Statement and
 * the GNU General Public License along with this program. If not,
 * see <http://www.lsstcorp.org/LegalNotices/>.
 */

#ifndef GPU_BUILD

namespace lsst {
namespace afw {
namespace math {
namespace detail {

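// Stub implementations compiled when afw is built without GPU support:
// TestGpuKernel reports failure (0, 0) and the shared-memory availability checks
// always return false, so the GPU convolution path is never selected.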
void TestGpuKernel(int& ret1, int& ret2) {
    ret1 = 0;
    ret2 = 0;
}

bool IsSufficientSharedMemoryAvailable_ForImgBlock(int filterW, int filterH, int pixSize)
{
    return false;
}
bool IsSufficientSharedMemoryAvailable_ForImgAndMaskBlock(int filterW, int filterH, int pixSize)
{
    return false;
}
bool IsSufficientSharedMemoryAvailable_ForSfn(int order, int kernelN)
{
    return false;
}

}
}
}
}

#else

#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>

#include "lsst/afw/math/Kernel.h"

using namespace std;
using namespace lsst::afw::gpu;
using namespace lsst::afw::gpu::detail;
namespace afwImage = lsst::afw::image;
namespace afwMath = lsst::afw::math;
namespace mathDetailGpu = lsst::afw::math::detail::gpu;


namespace lsst {
namespace afw {
namespace math {
namespace detail {

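// Shared memory (in bytes) assumed to be consumed for purposes other than the data
// buffers (e.g. kernel arguments); it is subtracted from the per-SM shared memory
// size in all of the availability checks and kernel launches below.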
namespace {
const int shMemBytesUsed = 200;
}


// Returns true if there is sufficient shared memory for loading an image block,
// where the image block includes the filter frame.
bool IsSufficientSharedMemoryAvailable_ForImgBlock(int filterW, int filterH, int pixSize)
{
    int shMemSize = GetCudaCurSMSharedMemorySize();
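    // each thread block loads a (blockSizeX + filterW - 1) x (blockSizeY + filterH - 1)
    // pixel tile (the block itself plus the filter apron), at pixSize bytes per pixel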
    int bufferSize = (filterW + blockSizeX - 1) * (filterH + blockSizeY - 1) * pixSize;

    return shMemSize - shMemBytesUsed - bufferSize > 0;
}

// Returns true if there is sufficient shared memory for loading an image block
// and an accompanying block of mask data (mask image block),
// where both the image block and the mask image block include the filter frame.
bool IsSufficientSharedMemoryAvailable_ForImgAndMaskBlock(int filterW, int filterH, int pixSize)
{
    int shMemSize = GetCudaCurSMSharedMemorySize();
    int imgBufferSize = (filterW + blockSizeX - 1) * (filterH + blockSizeY - 1) * pixSize;
    int mskBufferSize = (filterW + blockSizeX - 1) * (filterH + blockSizeY - 1) * sizeof(MskPixel);

    int memRemaining = shMemSize - shMemBytesUsed - imgBufferSize - mskBufferSize;

    return memRemaining > 0;
}

// Returns true if there is sufficient shared memory for loading
// the parameters and temporary values for the Chebyshev function and normalization
bool IsSufficientSharedMemoryAvailable_ForSfn(int order, int kernelN)
{
    int shMemSize = GetCudaCurSMSharedMemorySize();

    const int coeffN = order + 1;
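    // pad the per-thread coefficient buffers up to an odd length
    // (an odd stride, presumably to avoid shared-memory bank conflicts)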
    const int coeffPadding = coeffN + 1 - (coeffN % 2);
    int paramN = (order + 1) * (order + 2) / 2;

    int yCoeffsAll = coeffPadding * blockSizeX * sizeof(double);
    int xChebyAll = coeffPadding * blockSizeX * sizeof(double);
    int smemParams = paramN * sizeof(double);

    int smemKernelSum = kernelN * sizeof(double);
    int smemSfnPtr = kernelN * sizeof(double*);

    int memRemainingSfn = shMemSize - shMemBytesUsed - yCoeffsAll - xChebyAll - smemParams;
    int memRemainingNorm = shMemSize - shMemBytesUsed - smemKernelSum - smemSfnPtr;

    return min(memRemainingSfn, memRemainingNorm) > 0;
}

// Decides on the GPU block count using simple heuristics
// (not too few blocks and not too many),
// but guarantees that the number of blocks will be a multiple of the number of multiprocessors
int CalcBlockCount(int multiprocCount)
{
    if (multiprocCount < 12) return multiprocCount * 4;
    if (multiprocCount < 24) return multiprocCount * 2;
    return multiprocCount;
}


// Calls the test GPU kernel;
// it should return 5 and 8 in ret1 and ret2
void TestGpuKernel(int& ret1, int& ret2)
{
    int res[2];

    GpuMemOwner<int> resGpu;
    resGpu.Alloc(2);

    gpu::CallTestGpuKernel(resGpu.ptr);

    resGpu.CopyFromGpu(res);

    ret1 = res[0];
    ret2 = res[1];
}

namespace {

// Calculates the sum of each image in the 'images' vector
template <typename ResultT, typename InT>
vector<ResultT> SumsOfImages(const vector< GpuBuffer2D<InT> >& images)
{
    int n = int(images.size());
    vector<ResultT> sum(n);
    for (int i = 0; i < n; i++) {
        ResultT totalSum = 0;
        int h = images[i].height;
        int w = images[i].width;

        for (int y = 0; y < h; y++) {
            ResultT rowSum = 0;
            for (int x = 0; x < w; x++) {
                rowSum += images[i].Pixel(x, y);
            }
            totalSum += rowSum;
        }
        sum[i] = totalSum;
    }
    return sum;
}

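// Convolves a single image plane with a linear-combination kernel on the GPU.
// Evaluates every spatial function over the output image, optionally computes the
// normalization image from the basis kernel sums, and then launches the
// linear-combination convolution kernel. The sFnValGPUPtr/sFnValGPU and normGPU
// arguments must already point to GPU buffers allocated by the caller.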
template <typename OutPixelT, typename InPixelT>
void GPU_ConvolutionImage_LC_Img(
    const GpuBuffer2D<InPixelT>& inImage,
    const vector<double>& colPos,
    const vector<double>& rowPos,
    const std::vector< afwMath::Kernel::SpatialFunctionPtr >& sFn,
    const vector<double*>& sFnValGPUPtr, //output
    double** sFnValGPU, //output
    SpatialFunctionType_t sfType,
    GpuBuffer2D<OutPixelT>& outImage, //output
    KerPixel* basisKernelsListGPU,
    int kernelW, int kernelH,
    const vector<double>& basisKernelSums, //input
    double* normGPU, //output
    bool doNormalize
)
{
    const int kernelN = sFn.size();

    //transfer input image
    GpuMemOwner<InPixelT > inImageGPU;
    inImageGPU.Transfer(inImage);
    if (inImageGPU.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for input image");
    }
    // allocate output image planes on GPU
    GpuMemOwner<OutPixelT> outImageGPU;
    outImageGPU.Alloc( outImage.Size());
    if (outImageGPU.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for output image");
    }
    //transfer coordinate transform data
    GpuMemOwner<double> colPosGPU_Owner;
    colPosGPU_Owner.TransferVec(colPos);
    if (colPosGPU_Owner.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError,
                          "Not enough memory on GPU for column coordinate transformation data");
    }
    GpuMemOwner<double> rowPosGPU_Owner;
    rowPosGPU_Owner.TransferVec(rowPos);
    if (rowPosGPU_Owner.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError,
                          "Not enough memory on GPU for row coordinate transformation data");
    }
    vector< double* > sFnParamsGPUPtr(kernelN);
    vector< GpuMemOwner<double> > sFnParamsGPU_Owner(kernelN);

    //transfer sfn parameters to GPU
    for (int i = 0; i < kernelN; i++) {
        std::vector<double> spatialParams = sFn[i]->getParameters();
        sFnParamsGPUPtr[i] = sFnParamsGPU_Owner[i].TransferVec(spatialParams);
        if (sFnParamsGPUPtr[i] == NULL) {
            throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for spatial function parameters");
        }
    }

    int shMemSize = GetCudaCurSMSharedMemorySize() - shMemBytesUsed;

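    // evaluate each spatial function over the whole output image on the GPU;
    // the resulting value images are written into the buffers at sFnValGPUPtr[i]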
    for (int i = 0; i < kernelN; i++)
    {
        cudaGetLastError(); //clear error status

        if (sfType == sftPolynomial) {
            const afwMath::PolynomialFunction2<double>* polySfn =
                dynamic_cast<const afwMath::PolynomialFunction2<double>*>( sFn[i].get() );

            gpu::Call_PolynomialImageValues(
                sFnValGPUPtr[i], outImage.width, outImage.height,
                polySfn->getOrder(),
                sFnParamsGPU_Owner[i].ptr,
                rowPosGPU_Owner.ptr,
                colPosGPU_Owner.ptr,
                shMemSize
            );

            //cudaThreadSynchronize();
            cudaError_t cuda_err = cudaGetLastError();
            if (cuda_err != cudaSuccess) {
                throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");
            }
        }
        if (sfType == sftChebyshev) {
            const afwMath::Chebyshev1Function2<double>* chebSfn =
                dynamic_cast<const afwMath::Chebyshev1Function2<double>*>( sFn[i].get() );

            lsst::afw::geom::Box2D const xyRange = chebSfn->getXYRange();

            gpu::Call_ChebyshevImageValues(
                sFnValGPUPtr[i], outImage.width, outImage.height,
                chebSfn->getOrder(),
                sFnParamsGPU_Owner[i].ptr,
                rowPosGPU_Owner.ptr,
                colPosGPU_Owner.ptr,
                xyRange.getMinX(), xyRange.getMinY(), xyRange.getMaxX(), xyRange.getMaxY(),
                shMemSize
            );

            //cudaThreadSynchronize();
            cudaError_t cuda_err = cudaGetLastError();
            if (cuda_err != cudaSuccess) {
                throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");
            }
        }
    }
    cudaThreadSynchronize();
    cudaError_t cuda_err = cudaGetLastError();
    if (cuda_err != cudaSuccess) {
        throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");
    }

    int blockN = CalcBlockCount( GetCudaCurSMCount());

    //transfer basis kernel sums
    if (doNormalize) {
        GpuMemOwner<double> basisKernelSumsGPU;
        basisKernelSumsGPU.TransferVec(basisKernelSums);

        bool isDivideByZero = false;
        GpuMemOwner<bool> isDivideByZeroGPU;
        isDivideByZeroGPU.Transfer(&isDivideByZero, 1);

        gpu::Call_NormalizationImageValues(
            normGPU, outImage.width, outImage.height,
            sFnValGPU, kernelN,
            basisKernelSumsGPU.ptr,
            isDivideByZeroGPU.ptr,
            blockN,
            shMemSize
        );
        cudaThreadSynchronize();
        if (cudaGetLastError() != cudaSuccess) {
            throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");
        }
        CopyFromGpu<bool>(&isDivideByZero, isDivideByZeroGPU.ptr, 1);
        if (isDivideByZero) {
            throw LSST_EXCEPT(pexExcept::OverflowError, "Cannot normalize; kernel sum is 0");
        }
    }

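    // launch the main convolution kernel: each output pixel is a linear combination
    // of the basis-kernel convolutions, weighted by the spatial function value images
    // (and divided by the normalization image when doNormalize is set)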
    cudaGetLastError(); //clear error status
    mathDetailGpu::Call_ConvolutionKernel_LC_Img(
        inImageGPU.ptr, inImage.width, inImage.height,
        basisKernelsListGPU, kernelN,
        kernelW, kernelH,
        &sFnValGPU[0],
        normGPU,
        outImageGPU.ptr,
        blockN,
        shMemSize
    );
    cudaThreadSynchronize();
    if (cudaGetLastError() != cudaSuccess) {
        throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");
    }

    CopyFromGpu(outImage.img, outImageGPU.ptr, outImage.Size() );

}

} //local namespace ends

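// Convolves an image with a spatially varying LinearCombinationKernel on the GPU.
// Packs the basis kernels into one contiguous GPU buffer, allocates a spatial-function
// value image per basis kernel (and, if requested, a normalization image), then
// delegates the actual work to GPU_ConvolutionImage_LC_Img.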
template <typename OutPixelT, typename InPixelT>
void GPU_ConvolutionImage_LinearCombinationKernel(
    GpuBuffer2D<InPixelT>& inImage,
    vector<double> colPos,
    vector<double> rowPos,
    std::vector< afwMath::Kernel::SpatialFunctionPtr > sFn,
    GpuBuffer2D<OutPixelT>& outImage,
    std::vector< GpuBuffer2D<KerPixel> >& basisKernels,
    SpatialFunctionType_t sfType,
    bool doNormalize
)
{
    assert(basisKernels.size() == sFn.size());

    int outWidth = outImage.width;
    int outHeight = outImage.height;

    const int kernelN = sFn.size();
    const int kernelW = basisKernels[0].width;
    const int kernelH = basisKernels[0].height;
    const int kernelSize = kernelW * kernelH;

    for (int i = 0; i < kernelN; i++) {
        assert(kernelW == basisKernels[i].width);
        assert(kernelH == basisKernels[i].height);
    }

    // transfer array of basis kernels to GPU
    GpuMemOwner<KerPixel > basisKernelsGPU;
    basisKernelsGPU.Alloc(kernelSize * kernelN);

    for (int i = 0; i < kernelN; i++) {
        KerPixel* kernelBeg = basisKernelsGPU.ptr + (kernelSize * i);
        CopyToGpu(kernelBeg,
                  basisKernels[i].img,
                  kernelSize
                 );
    }

    // allocate array of spatial function value images on GPU
    vector< double* > sFnValGPUPtr(kernelN);
    vector< GpuMemOwner<double > > sFnValGPU_Owner(kernelN);

    for (int i = 0; i < kernelN; i++) {
        sFnValGPUPtr[i] = sFnValGPU_Owner[i].Alloc(outWidth * outHeight);
        if (sFnValGPUPtr[i] == NULL) {
            throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for spatial function values");
        }
    }
    GpuMemOwner<double*> sFnValGPU;
    sFnValGPU.TransferVec(sFnValGPUPtr);

    GpuMemOwner<double > normGPU_Owner;
    vector<double> basisKernelSums(kernelN);
    if (doNormalize) {
        //allocate normalization coefficients
        normGPU_Owner.Alloc(outWidth * outHeight);
        if (normGPU_Owner.ptr == NULL) {
            throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for normalization coefficients");
        }

        //calculate basis kernel sums
        basisKernelSums = SumsOfImages<double, KerPixel>(basisKernels);
    }

    GPU_ConvolutionImage_LC_Img(
        inImage,
        colPos, rowPos,
        sFn,
        sFnValGPUPtr, //output
        sFnValGPU.ptr, //output
        sfType,
        outImage, //output
        basisKernelsGPU.ptr,
        kernelW, kernelH,
        basisKernelSums, //input
        normGPU_Owner.ptr, //output
        doNormalize
    );

}

#define INSTANTIATE_GPU_ConvolutionImage_LinearCombinationKernel(OutPixelT,InPixelT) \
    template void GPU_ConvolutionImage_LinearCombinationKernel<OutPixelT,InPixelT>( \
        GpuBuffer2D<InPixelT>& inImage, \
        vector<double> colPos, \
        vector<double> rowPos, \
        std::vector< afwMath::Kernel::SpatialFunctionPtr > sFn, \
        GpuBuffer2D<OutPixelT>& outImage, \
        std::vector< GpuBuffer2D<KerPixel> >& basisKernels, \
        SpatialFunctionType_t sfType, \
        bool doNormalize \
    );

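// Convolves a masked image (image, variance and mask planes) with a spatially varying
// LinearCombinationKernel on the GPU. The image plane is handled by
// GPU_ConvolutionImage_LC_Img; the variance and mask planes are then convolved by a
// dedicated kernel that reuses the same basis kernels and spatial function value images.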
template <typename OutPixelT, typename InPixelT>
void GPU_ConvolutionMI_LinearCombinationKernel(
    GpuBuffer2D<InPixelT>& inImageImg,
    GpuBuffer2D<VarPixel>& inImageVar,
    GpuBuffer2D<MskPixel>& inImageMsk,
    vector<double> colPos,
    vector<double> rowPos,
    std::vector< afwMath::Kernel::SpatialFunctionPtr > sFn,
    GpuBuffer2D<OutPixelT>& outImageImg,
    GpuBuffer2D<VarPixel>& outImageVar,
    GpuBuffer2D<MskPixel>& outImageMsk,
    std::vector< GpuBuffer2D<KerPixel> >& basisKernels,
    SpatialFunctionType_t sfType,
    bool doNormalize
)
{
    assert(basisKernels.size() == sFn.size());
    assert(outImageImg.width == outImageVar.width);
    assert(outImageImg.width == outImageMsk.width);
    assert(outImageImg.height == outImageVar.height);
    assert(outImageImg.height == outImageMsk.height);

    int outWidth = outImageImg.width;
    int outHeight = outImageImg.height;

    const int kernelN = sFn.size();
    const int kernelW = basisKernels[0].width;
    const int kernelH = basisKernels[0].height;
    const int kernelSize = kernelW * kernelH;

    for (int i = 0; i < kernelN; i++) {
        assert(kernelW == basisKernels[i].width);
        assert(kernelH == basisKernels[i].height);
    }

    // transfer basis kernels to GPU
    GpuMemOwner<KerPixel > basisKernelsGPU;
    basisKernelsGPU.Alloc(kernelSize * kernelN);

    for (int i = 0; i < kernelN; i++) {
        KerPixel* kernelBeg = basisKernelsGPU.ptr + (kernelSize * i);
        CopyToGpu(kernelBeg,
                  basisKernels[i].img,
                  kernelSize
                 );
    }

    //alloc sFn images on GPU
    vector< double* > sFnValGPUPtr(kernelN);
    vector< GpuMemOwner<double > > sFnValGPU_Owner(kernelN);

    for (int i = 0; i < kernelN; i++) {
        sFnValGPUPtr[i] = sFnValGPU_Owner[i].Alloc(outWidth * outHeight);
        if (sFnValGPUPtr[i] == NULL) {
            throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for spatial function values");
        }
    }
    GpuMemOwner<double*> sFnValGPU;
    sFnValGPU.TransferVec(sFnValGPUPtr);

    //allocate normalization coefficients image on GPU
    GpuMemOwner<double > normGPU_Owner;
    std::vector<KerPixel> basisKernelSums(kernelN);
    if (doNormalize) {
        //allocate normalization coefficients
        normGPU_Owner.Alloc(outWidth * outHeight);
        if (normGPU_Owner.ptr == NULL) {
            throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for normalization coefficients");
        }
        //calculate basis kernel sums
        basisKernelSums = SumsOfImages<double, KerPixel>(basisKernels);
    }

    GPU_ConvolutionImage_LC_Img(
        inImageImg,
        colPos, rowPos,
        sFn,
        sFnValGPUPtr, //output
        sFnValGPU.ptr, //output
        sfType,
        outImageImg, //output
        basisKernelsGPU.ptr,
        kernelW, kernelH,
        basisKernelSums, //input
        normGPU_Owner.ptr, //output
        doNormalize
    );

    //transfer input image planes to GPU
    GpuMemOwner<VarPixel> inImageGPUVar;
    inImageGPUVar.Transfer(inImageVar);
    if (inImageGPUVar.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for input variance");
    }
    GpuMemOwner<MskPixel> inImageGPUMsk;
    inImageGPUMsk.Transfer(inImageMsk);
    if (inImageGPUMsk.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for input mask");
    }

    // allocate output image planes on GPU
    GpuMemOwner<VarPixel > outImageGPUVar;
    outImageGPUVar.Alloc( outImageVar.Size());
    if (outImageGPUVar.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for output variance");
    }
    GpuMemOwner<MskPixel > outImageGPUMsk;
    outImageGPUMsk.Alloc( outImageMsk.Size());
    if (outImageGPUMsk.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for output mask");
    }
    int shMemSize = GetCudaCurSMSharedMemorySize() - shMemBytesUsed;
    int blockN = CalcBlockCount( GetCudaCurSMCount());

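    // convolve the variance plane and propagate the mask plane on the GPU,
    // reusing the basis kernels and spatial function value images computed above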
    cudaGetLastError(); //clear error status
    mathDetailGpu::Call_ConvolutionKernel_LC_Var(
        inImageGPUVar.ptr, inImageVar.width, inImageVar.height,
        inImageGPUMsk.ptr,
        basisKernelsGPU.ptr, kernelN,
        kernelW, kernelH,
        sFnValGPU.ptr,
        normGPU_Owner.ptr,
        outImageGPUVar.ptr,
        outImageGPUMsk.ptr,
        blockN,
        shMemSize
    );
    cudaThreadSynchronize();
    if (cudaGetLastError() != cudaSuccess)
        throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");

    CopyFromGpu(outImageVar.img, outImageGPUVar.ptr, outImageVar.Size() );
    CopyFromGpu(outImageMsk.img, outImageGPUMsk.ptr, outImageMsk.Size() );
}

#define INSTANTIATE_GPU_ConvolutionMI_LinearCombinationKernel(OutPixelT,InPixelT) \
    template void GPU_ConvolutionMI_LinearCombinationKernel<OutPixelT,InPixelT>( \
        GpuBuffer2D<InPixelT>& inImageImg, \
        GpuBuffer2D<VarPixel>& inImageVar, \
        GpuBuffer2D<MskPixel>& inImageMsk, \
        vector<double> colPos, \
        vector<double> rowPos, \
        std::vector< afwMath::Kernel::SpatialFunctionPtr > sFn, \
        GpuBuffer2D<OutPixelT>& outImageImg, \
        GpuBuffer2D<VarPixel>& outImageVar, \
        GpuBuffer2D<MskPixel>& outImageMsk, \
        std::vector< GpuBuffer2D<KerPixel> >& basisKernels, \
        SpatialFunctionType_t sfType, \
        bool doNormalize \
    );


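// Convolves a single image plane with a spatially invariant kernel on the GPU:
// transfers the input image and the kernel, launches the spatially invariant
// convolution kernel once, and copies the result back into outImage.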
template <typename OutPixelT, typename InPixelT>
void GPU_ConvolutionImage_SpatiallyInvariantKernel(
    GpuBuffer2D<InPixelT>& inImage,
    GpuBuffer2D<OutPixelT>& outImage,
    GpuBuffer2D<KerPixel>& kernel
)
{
    int kernelW = kernel.width;
    int kernelH = kernel.height;

    GpuMemOwner<InPixelT> inImageGPU;
    inImageGPU.Transfer(inImage);
    if (inImageGPU.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU for input image");
    }
    int shMemSize = GetCudaCurSMSharedMemorySize() - shMemBytesUsed;

    // allocate array of kernels on GPU
    GpuMemOwner<KerPixel > basisKernelGPU;
    basisKernelGPU.Transfer(kernel);
    if (basisKernelGPU.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for kernel");
    }
    // allocate array of output images on GPU (one output image per kernel)
    vector< OutPixelT* > outImageGPUPtr(1);
    vector< GpuMemOwner<OutPixelT> > outImageGPU_Owner(1);

    outImageGPUPtr[0] = outImageGPU_Owner[0].Alloc( outImage.Size());
    if (outImageGPUPtr[0] == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for output image");
    }
    GpuMemOwner<OutPixelT*> outImageGPU;
    outImageGPU.TransferVec(outImageGPUPtr);

    int blockN = CalcBlockCount( GetCudaCurSMCount());

    cudaGetLastError(); //clear error status
    mathDetailGpu::Call_SpatiallyInvariantImageConvolutionKernel<OutPixelT, InPixelT>(
        inImageGPU.ptr, inImage.width, inImage.height,
        basisKernelGPU.ptr, 1,
        kernelW, kernelH,
        outImageGPU.ptr,
        blockN,
        shMemSize
    );
    cudaThreadSynchronize();
    if (cudaGetLastError() != cudaSuccess) {
        throw LSST_EXCEPT(GpuRuntimeError, "GPU calculation failed to run");
    }
    CopyFromGpu(outImage.img, outImageGPUPtr[0], outImage.Size() );
}

#define INSTANTIATE_GPU_ConvolutionImage_SpatiallyInvariantKernel(OutPixelT,InPixelT) \
    template void GPU_ConvolutionImage_SpatiallyInvariantKernel<OutPixelT,InPixelT>( \
        GpuBuffer2D<InPixelT>& inImage, \
        GpuBuffer2D<OutPixelT>& outImage, \
        GpuBuffer2D<KerPixel>& kernel \
    );

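// Convolves a masked image (image, variance and mask planes) with a spatially
// invariant kernel on the GPU. The image plane is convolved with the kernel itself,
// the variance plane with the squared kernel, and the mask plane is propagated by a
// dedicated mask convolution kernel.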
template <typename OutPixelT, typename InPixelT>
void GPU_ConvolutionMI_SpatiallyInvariantKernel(
    GpuBuffer2D<InPixelT>& inImageImg,
    GpuBuffer2D<VarPixel>& inImageVar,
    GpuBuffer2D<MskPixel>& inImageMsk,
    GpuBuffer2D<OutPixelT>& outImageImg,
    GpuBuffer2D<VarPixel>& outImageVar,
    GpuBuffer2D<MskPixel>& outImageMsk,
    GpuBuffer2D<KerPixel>& kernel
)
{
    int kernelW = kernel.width;
    int kernelH = kernel.height;

    GpuMemOwner<InPixelT> inImageGPUImg;
    inImageGPUImg.Transfer(inImageImg);
    if (inImageGPUImg.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for input image");
    }
    GpuMemOwner<VarPixel> inImageGPUVar;
    inImageGPUVar.Transfer(inImageVar);
    if (inImageGPUVar.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for input variance");
    }
    GpuMemOwner<MskPixel> inImageGPUMsk;
    inImageGPUMsk.Transfer(inImageMsk);
    if (inImageGPUMsk.ptr == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for input mask");
    }
    int shMemSize = GetCudaCurSMSharedMemorySize() - shMemBytesUsed;

    //allocate kernel on GPU
    GpuMemOwner<KerPixel > basisKernelGPU;
    basisKernelGPU.Transfer(kernel);
    if (basisKernelGPU.ptr == NULL)
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for kernel");

    // allocate array of output image planes on GPU
    vector< OutPixelT* > outImageGPUPtrImg(1);
    vector< VarPixel* > outImageGPUPtrVar(1);
    vector< MskPixel* > outImageGPUPtrMsk(1);

    vector< GpuMemOwner<OutPixelT> > outImageGPU_OwnerImg(1);
    vector< GpuMemOwner<VarPixel > > outImageGPU_OwnerVar(1);
    vector< GpuMemOwner<MskPixel > > outImageGPU_OwnerMsk(1);

    outImageGPUPtrImg[0] = outImageGPU_OwnerImg[0].Alloc( outImageImg.Size());
    if (outImageGPUPtrImg[0] == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for output image");
    }
    outImageGPUPtrVar[0] = outImageGPU_OwnerVar[0].Alloc( outImageVar.Size());
    if (outImageGPUPtrVar[0] == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for output variance");
    }
    outImageGPUPtrMsk[0] = outImageGPU_OwnerMsk[0].Alloc( outImageMsk.Size());
    if (outImageGPUPtrMsk[0] == NULL) {
        throw LSST_EXCEPT(GpuMemoryError, "Not enough memory on GPU available for output mask");
    }

    GpuMemOwner<OutPixelT*> outImageGPUImg;
    outImageGPUImg.TransferVec(outImageGPUPtrImg);
    GpuMemOwner<VarPixel*> outImageGPUVar;
    outImageGPUVar.TransferVec(outImageGPUPtrVar);
    GpuMemOwner<MskPixel*> outImageGPUMsk;
    outImageGPUMsk.TransferVec(outImageGPUPtrMsk);

    int blockN = CalcBlockCount( GetCudaCurSMCount());

    mathDetailGpu::Call_SpatiallyInvariantImageConvolutionKernel<OutPixelT, InPixelT>(
        inImageGPUImg.ptr, inImageImg.width, inImageImg.height,
        basisKernelGPU.ptr, 1,
        kernelW, kernelH,
        outImageGPUImg.ptr,
        blockN,
        shMemSize
    );
    // square the kernel: the variance plane must be convolved with the squared kernel weights
    for (int y = 0; y < kernelH; y++) {
        for (int x = 0; x < kernelW; x++) {
            kernel.Pixel(x, y) *= kernel.Pixel(x, y);
        }
    }

    CopyFromGpu(outImageImg.img, outImageGPUPtrImg[0], outImageImg.Size() );

    basisKernelGPU.CopyToGpu(kernel);

    cudaGetLastError(); //clear last error

    mathDetailGpu::Call_SpatiallyInvariantImageConvolutionKernel<VarPixel, VarPixel>(
        inImageGPUVar.ptr, inImageVar.width, inImageVar.height,
        basisKernelGPU.ptr, 1,
        kernelW, kernelH,
        outImageGPUVar.ptr,
        blockN,
        shMemSize
    );

    cudaThreadSynchronize();
    if (cudaGetLastError() != cudaSuccess) {
        throw LSST_EXCEPT(GpuRuntimeError, "GPU variance calculation failed to run");
    }
    mathDetailGpu::Call_SpatiallyInvariantMaskConvolutionKernel(
        inImageGPUMsk.ptr, inImageMsk.width, inImageMsk.height,
        basisKernelGPU.ptr, 1,
        kernelW, kernelH,
        outImageGPUMsk.ptr,
        blockN,
        shMemSize
    );
    cudaThreadSynchronize();
    if (cudaGetLastError() != cudaSuccess) {
        throw LSST_EXCEPT(GpuRuntimeError, "GPU mask calculation failed to run");
    }
    CopyFromGpu(outImageVar.img, outImageGPUPtrVar[0], outImageVar.Size() );
    CopyFromGpu(outImageMsk.img, outImageGPUPtrMsk[0], outImageMsk.Size() );
}

#define INSTANTIATE_GPU_ConvolutionMI_SpatiallyInvariantKernel(OutPixelT,InPixelT) \
    template void GPU_ConvolutionMI_SpatiallyInvariantKernel<OutPixelT,InPixelT>( \
        GpuBuffer2D<InPixelT>& inImageImg, \
        GpuBuffer2D<VarPixel>& inImageVar, \
        GpuBuffer2D<MskPixel>& inImageMsk, \
        GpuBuffer2D<OutPixelT>& outImageImg, \
        GpuBuffer2D<VarPixel>& outImageVar, \
        GpuBuffer2D<MskPixel>& outImageMsk, \
        GpuBuffer2D<KerPixel>& kernel \
    );

/*
 * Explicit instantiation
 */

#define INSTANTIATE(OutPixelT,InPixelT) \
    INSTANTIATE_GPU_ConvolutionImage_LinearCombinationKernel(OutPixelT,InPixelT) \
    INSTANTIATE_GPU_ConvolutionMI_LinearCombinationKernel(OutPixelT,InPixelT) \
    INSTANTIATE_GPU_ConvolutionImage_SpatiallyInvariantKernel(OutPixelT,InPixelT) \
    INSTANTIATE_GPU_ConvolutionMI_SpatiallyInvariantKernel(OutPixelT,InPixelT)


INSTANTIATE(double, double)
INSTANTIATE(double, float)
INSTANTIATE(double, int)
INSTANTIATE(double, boost::uint16_t)
INSTANTIATE(float, float)
INSTANTIATE(float, int)
INSTANTIATE(float, boost::uint16_t)
INSTANTIATE(int, int)
INSTANTIATE(boost::uint16_t, boost::uint16_t)

}
}
}
} //namespace lsst::afw::math::detail ends

#endif //GPU_BUILD