37 #include <cuda_runtime.h>
65 namespace mathDetail = lsst::afw::math::detail;
66 namespace gpuDetail = lsst::afw::gpu::detail;
67 namespace afwMath = lsst::afw::math;
68 namespace afwGpu = lsst::afw::gpu;
70 namespace pexExcept = lsst::pex::exceptions;
71 namespace afwGeom = lsst::afw::geom;
80 int CeilDivide(
int num,
int divisor)
82 return (num + divisor - 1) / divisor;
86 int InterpBlkN(
int size ,
int interpLength)
88 return CeilDivide(size , interpLength) + 1;
92 gpu::SPoint2 GetInterpolatedValue(afwGpu::detail::GpuBuffer2D<gpu::BilinearInterp>
const & interpBuf,
93 int blkX,
int blkY,
int subX,
int subY
96 gpu::BilinearInterp interp = interpBuf.Pixel(blkX, blkY);
97 return interp.Interpolate(subX, subY);
101 gpu::SPoint2 GetInterpolatedValue(afwGpu::detail::GpuBuffer2D<gpu::BilinearInterp>
const & interpBuf,
102 int interpLen,
int x,
int y
105 int blkX = x / interpLen;
106 int blkY = y / interpLen;
108 int subX = x % interpLen;
109 int subY = y % interpLen;
111 return GetInterpolatedValue(interpBuf, blkX, blkY, subX, subY);
116 int NumGoodPixels(afwGpu::detail::GpuBuffer2D<gpu::BilinearInterp>
const & interpBuf,
117 int const interpLen,
int const width,
int const height,
SBox2I srcGoodBox)
121 int subY = 1, blkY = 0;
122 for (
int row = 0;
row < height;
row++, subY++) {
123 if (subY >= interpLen) {
128 int subX = 1, blkX = 0;
129 gpu::BilinearInterp interp = interpBuf.Pixel(blkX, blkY);
130 gpu::LinearInterp lineY = interp.GetLinearInterp(subY);
132 for (
int col = 0;
col < width;
col++, subX++) {
133 if (subX >= interpLen) {
136 interp = interpBuf.Pixel(blkX, blkY);
137 lineY = interp.GetLinearInterp(subY);
139 gpu::SPoint2 srcPos = lineY.Interpolate(subX);
152 template<
typename DestPixelT,
typename SrcPixelT>
153 int WarpImageGpuWrapper(
161 int const interpLength,
167 typename DestImageT::SinglePixel
const edgePixel = padValue;
169 gpu::PixelIVM<DestPixelT> edgePixelGpu;
171 edgePixelGpu.var = -1;
172 edgePixelGpu.msk = -1;
174 int const destWidth = destImage.
getWidth();
175 int const destHeight = destImage.
getHeight();
176 gpuDetail::GpuMemOwner<DestPixelT> destBufImgGpu;
177 gpuDetail::GpuMemOwner<SrcPixelT> srcBufImgGpu;
178 gpuDetail::GpuMemOwner<gpu::BilinearInterp> srcPosInterpGpu;
180 gpu::ImageDataPtr<DestPixelT> destImgGpu;
181 destImgGpu.strideImg = destBufImgGpu.AllocImageBaseBuffer(destImage);
182 if (destBufImgGpu.ptr == NULL) {
183 throw LSST_EXCEPT(afwGpu::GpuMemoryError,
"Not enough memory on GPU for output image");
185 destImgGpu.img = destBufImgGpu.ptr;
186 destImgGpu.var = NULL;
187 destImgGpu.msk = NULL;
188 destImgGpu.width = destWidth;
189 destImgGpu.height = destHeight;
191 gpu::ImageDataPtr<SrcPixelT> srcImgGpu;
192 srcImgGpu.strideImg = srcBufImgGpu.TransferFromImageBase(srcImage);
193 if (srcBufImgGpu.ptr == NULL) {
194 throw LSST_EXCEPT(afwGpu::GpuMemoryError,
"Not enough memory on GPU for input image");
196 srcImgGpu.img = srcBufImgGpu.ptr;
197 srcImgGpu.var = NULL;
198 srcImgGpu.msk = NULL;
199 srcImgGpu.width = srcImage.
getWidth();
202 srcPosInterpGpu.Transfer(srcPosInterp);
203 if (srcPosInterpGpu.ptr == NULL) {
205 "Not enough memory on GPU for interpolation data for coorinate transformation");
211 destImgGpu, srcImgGpu,
217 srcPosInterpGpu.ptr, interpLength
220 int numGoodPixels = NumGoodPixels(srcPosInterp, interpLength, destWidth, destHeight, srcBoxConv);
222 cudaThreadSynchronize();
223 if (cudaGetLastError() != cudaSuccess) {
224 throw LSST_EXCEPT(afwGpu::GpuRuntimeError,
"GPU calculation failed to run");
227 destBufImgGpu.CopyToImageBase(destImage);
228 return numGoodPixels;
234 template<
typename DestPixelT,
typename SrcPixelT>
235 int WarpImageGpuWrapper(
243 int const interpLength,
249 typename DestImageT::SinglePixel
const edgePixel = padValue;
251 gpu::PixelIVM<DestPixelT> edgePixelGpu;
252 edgePixelGpu.img = edgePixel.image();
253 edgePixelGpu.var = edgePixel.variance();
254 edgePixelGpu.msk = edgePixel.mask();
256 int const destWidth = dstImage.
getWidth();
257 int const destHeight = dstImage.
getHeight();
259 gpuDetail::GpuMemOwner<DestPixelT> destBufImgGpu;
260 gpuDetail::GpuMemOwner<gpu::VarPixel> destBufVarGpu;
261 gpuDetail::GpuMemOwner<gpu::MskPixel> destBufMskGpu;
263 gpuDetail::GpuMemOwner<SrcPixelT> srcBufImgGpu;
264 gpuDetail::GpuMemOwner<gpu::VarPixel> srcBufVarGpu;
265 gpuDetail::GpuMemOwner<gpu::MskPixel> srcBufMskGpu;
267 gpuDetail::GpuMemOwner<gpu::BilinearInterp> srcPosInterpGpu;
269 mathDetail::gpu::ImageDataPtr<DestPixelT> destImgGpu;
270 destImgGpu.strideImg = destBufImgGpu.AllocImageBaseBuffer(*dstImage.
getImage());
271 destImgGpu.strideVar = destBufVarGpu.AllocImageBaseBuffer(*dstImage.
getVariance());
272 destImgGpu.strideMsk = destBufMskGpu.AllocImageBaseBuffer(*dstImage.
getMask());
273 if (destBufImgGpu.ptr == NULL) {
274 throw LSST_EXCEPT(afwGpu::GpuMemoryError,
"Not enough memory on GPU for output image");
276 if (destBufVarGpu.ptr == NULL) {
277 throw LSST_EXCEPT(afwGpu::GpuMemoryError,
"Not enough memory on GPU for output variance");
279 if (destBufMskGpu.ptr == NULL) {
280 throw LSST_EXCEPT(afwGpu::GpuMemoryError,
"Not enough memory on GPU for output mask");
282 destImgGpu.img = destBufImgGpu.ptr;
283 destImgGpu.var = destBufVarGpu.ptr;
284 destImgGpu.msk = destBufMskGpu.ptr;
285 destImgGpu.width = destWidth;
286 destImgGpu.height = destHeight;
288 gpu::ImageDataPtr<SrcPixelT> srcImgGpu;
289 srcImgGpu.strideImg = srcBufImgGpu.TransferFromImageBase(*srcImage.
getImage());
290 if (srcBufImgGpu.ptr == NULL) {
291 throw LSST_EXCEPT(afwGpu::GpuMemoryError,
"Not enough memory on GPU for input image");
293 srcImgGpu.strideVar = srcBufVarGpu.TransferFromImageBase(*srcImage.
getVariance());
294 if (srcBufVarGpu.ptr == NULL) {
295 throw LSST_EXCEPT(afwGpu::GpuMemoryError,
"Not enough memory on GPU for input variance");
297 srcImgGpu.strideMsk = srcBufMskGpu.TransferFromImageBase(*srcImage.
getMask());
298 if (srcBufMskGpu.ptr == NULL) {
299 throw LSST_EXCEPT(afwGpu::GpuMemoryError,
"Not enough memory on GPU for input mask");
302 srcImgGpu.img = srcBufImgGpu.ptr;
303 srcImgGpu.var = srcBufVarGpu.ptr;
304 srcImgGpu.msk = srcBufMskGpu.ptr;
305 srcImgGpu.width = srcImage.
getWidth();
308 srcPosInterpGpu.Transfer(srcPosInterp);
309 if (srcPosInterpGpu.ptr == NULL) {
311 "Not enough memory on GPU for interpolation data for coorinate transformation");
317 destImgGpu, srcImgGpu,
323 srcPosInterpGpu.ptr, interpLength
325 int numGoodPixels = NumGoodPixels(srcPosInterp, interpLength, destWidth, destHeight, srcBoxConv);
327 cudaThreadSynchronize();
328 if (cudaGetLastError() != cudaSuccess) {
329 throw LSST_EXCEPT(afwGpu::GpuRuntimeError,
"GPU calculation failed to run");
332 destBufImgGpu.CopyToImageBase(*dstImage.
getImage());
333 destBufVarGpu.CopyToImageBase(*dstImage.
getVariance());
334 destBufMskGpu.CopyToImageBase(*dstImage.
getMask());
336 return numGoodPixels;
349 int destWidth,
int destHeight)
351 int const interpBlkNX = InterpBlkN(destWidth , interpLength);
352 int const interpBlkNY = InterpBlkN(destHeight, interpLength);
354 for (
int row = -1, rowBand = 0; rowBand < interpBlkNY - 1;
row += interpLength, rowBand++) {
355 double const invInterpLen = 1.0 / interpLength;
356 double const invInterpLenRow =
row + interpLength <= destHeight - 1 ?
357 invInterpLen : 1.0 / (destHeight - 1 -
row);
359 for (
int col = -1, colBand = 0; colBand < interpBlkNX - 1;
col += interpLength, colBand++) {
361 const SPoint2 p11 = srcPosInterp.
Pixel(colBand , rowBand ).o;
362 const SPoint2 p12 = srcPosInterp.
Pixel(colBand + 1, rowBand ).o;
363 const SPoint2 p21 = srcPosInterp.
Pixel(colBand , rowBand + 1).o;
364 const SPoint2 p22 = srcPosInterp.
Pixel(colBand + 1, rowBand + 1).o;
370 double const invInterpLenCol =
col + interpLength <= destWidth - 1 ?
371 invInterpLen : 1.0 / (destWidth - 1 -
col);
373 gpu::BilinearInterp lin = srcPosInterp.
Pixel(colBand, rowBand);
374 lin.deltaY =
VecMul(band_dY , invInterpLenRow);
375 lin.d0X =
VecMul(band_d0X, invInterpLenCol);
376 lin.ddX =
VecMul(band_ddX, invInterpLenCol);
377 srcPosInterp.
Pixel(colBand, rowBand) = lin;
380 if (colBand == interpBlkNX - 2) {
381 srcPosInterp.
Pixel(interpBlkNX - 1, rowBand).deltaY =
384 if (rowBand == interpBlkNY - 2) {
385 srcPosInterp.
Pixel(colBand, interpBlkNY - 1).d0X =
395 template<
typename DestImageT,
typename SrcImageT>
397 DestImageT &destImage,
398 SrcImageT
const &srcImage,
402 int const interpLength,
403 typename DestImageT::SinglePixel padValue,
404 const bool forceProcessing
407 if (interpLength < 1) {
408 throw LSST_EXCEPT(pexExcept::InvalidParameterError,
409 "GPU accelerated warping must use interpolation");
412 int const srcWidth = srcImage.getWidth();
413 int const srcHeight = srcImage.getHeight();
414 LOGL_DEBUG(
"TRACE2.afw.math.warp",
"(GPU) source image width=%d; height=%d", srcWidth, srcHeight);
417 throw LSST_EXCEPT(afwGpu::GpuRuntimeError,
"Afw not compiled with GPU support");
423 if (dynamic_cast<afwMath::LanczosWarpingKernel const*>(&maskWarpingKernel)) {
425 }
else if (dynamic_cast<afwMath::BilinearWarpingKernel const*>(&maskWarpingKernel)) {
427 }
else if (dynamic_cast<afwMath::NearestWarpingKernel const*>(&maskWarpingKernel)) {
430 throw LSST_EXCEPT(pexExcept::InvalidParameterError,
"unknown type of mask warping kernel");
439 int const mainKernelSize = 2 * lanczosKernel.
getOrder();
446 if (!forceProcessing && interpLength < 3) {
450 int const destWidth = destImage.getWidth();
451 int const destHeight = destImage.getHeight();
452 LOGL_DEBUG(
"TRACE2.afw.math.warp",
"(GPU) remap image width=%d; height=%d", destWidth, destHeight);
454 int const maxCol = destWidth - 1;
455 int const maxRow = destHeight - 1;
462 int const interpBlkNX = InterpBlkN(destWidth , interpLength);
463 int const interpBlkNY = InterpBlkN(destHeight, interpLength);
468 for (
int rowBand = 0; rowBand < interpBlkNY; rowBand++) {
469 int row = min(maxRow, (rowBand * interpLength - 1));
470 for (
int colBand = 0; colBand < interpBlkNX; colBand++) {
471 int col = min(maxCol, (colBand * interpLength - 1));
474 sSrcPos =
MovePoint(sSrcPos,
SVec2(-srcImage.getX0(), -srcImage.getY0()));
475 srcPosInterp.
Pixel(colBand, rowBand).o = sSrcPos;
479 CalculateInterpolationData(srcPosInterp, interpLength, destWidth, destHeight);
481 int numGoodPixels = 0;
483 LOGL_DEBUG(
"TRACE2.afw.math.warp",
"using GPU acceleration, remapping masked image");
492 numGoodPixels = WarpImageGpuWrapper(destImage,
498 srcPosInterp, interpLength, padValue
508 #define MASKEDIMAGE(PIXTYPE) afwImage::MaskedImage<PIXTYPE, afwImage::MaskPixel, afwImage::VariancePixel>
509 #define IMAGE(PIXTYPE) afwImage::Image<PIXTYPE>
512 #define INSTANTIATE(DESTIMAGEPIXELT, SRCIMAGEPIXELT) \
513 template std::pair<int,WarpImageGpuStatus::ReturnCode> warpImageGPU( \
514 IMAGE(DESTIMAGEPIXELT) &destImage, \
515 IMAGE(SRCIMAGEPIXELT) const &srcImage, \
516 afwMath::LanczosWarpingKernel const &warpingKernel, \
517 afwMath::SeparableKernel const &maskWarpingKernel, \
518 PositionFunctor const &computeSrcPos, \
519 int const interpLength, \
520 IMAGE(DESTIMAGEPIXELT)::SinglePixel padValue, \
521 const bool forceProcessing); NL \
522 template std::pair<int,WarpImageGpuStatus::ReturnCode> warpImageGPU( \
523 MASKEDIMAGE(DESTIMAGEPIXELT) &destImage, \
524 MASKEDIMAGE(SRCIMAGEPIXELT) const &srcImage, \
525 afwMath::LanczosWarpingKernel const &warpingKernel, \
526 afwMath::SeparableKernel const &maskWarpingKernel, \
527 PositionFunctor const &computeSrcPos, \
528 int const interpLength, \
529 MASKEDIMAGE(DESTIMAGEPIXELT)::SinglePixel padValue, \
530 const bool forceProcessing);
bool TryToSelectCudaDevice(bool noExceptions, bool reselect)
GPU accelerared image warping.
Declare the Kernel class and subclasses.
Class for representing an image or 2D array in general)
SPoint2 MovePoint(SPoint2 p, SVec2 v)
Declaration of a GPU kernel for image warping and declarations of requred datatypes.
#define LOGL_DEBUG(logger, message...)
SVec2 VecMul(SVec2 v, double m)
additional GPU exceptions
int getHeight() const
Return the number of rows in the image.
A kernel described by a pair of functions: func(x, y) = colFunc(x) * rowFunc(y)
bool isInsideBox(gpu::SPoint2 p)
ImagePtr getImage(bool const noThrow=false) const
Return a (Ptr to) the MaskedImage's image.
void WarpImageGpuCallKernel(bool isMaskedImage, ImageDataPtr< DestPixelT > destImageGpu, ImageDataPtr< SrcPixelT > srcImageGpu, int mainKernelSize, KernelType maskKernelType, int maskKernelSize, SBox2I srcGoodBox, PixelIVM< DestPixelT > edgePixel, BilinearInterp *srcPosInterp, int interpLength)
Calls the GPU kernel for lanczos resampling.
defines a 2D range of integer values begX <= x < endX, begY <= y < endY
int const SIZE_MAX_WARPING_KERNEL
contains GpuBuffer2D class (for simple handling of images or 2D arrays)
An integer coordinate rectangle.
VariancePtr getVariance(bool const noThrow=false) const
Return a (Ptr to) the MaskedImage's variance.
table::Key< table::Array< Kernel::Pixel > > image
int getOrder() const
get the order of the kernel
std::pair< int, WarpImageGpuStatus::ReturnCode > warpImageGPU(DestImageT &destImage, SrcImageT const &srcImage, afwMath::LanczosWarpingKernel const &lanczosKernel, lsst::afw::math::SeparableKernel const &maskWarpingKernel, PositionFunctor const &computeSrcPos, int const interpLength, typename DestImageT::SinglePixel padValue, const bool forceProcessing)
GPU accelerated image warping using Lanczos resampling.
Base class to transform pixel position for a destination image to its position in the original source...
Simple 2D point (suitable for use on a GPU)
MaskPtr getMask(bool const noThrow=false) const
Return a (Ptr to) the MaskedImage's mask.
A class to manipulate images, masks, and variance as a single object.
bool isGpuBuild()
Inline function which returns true only when GPU_BUILD macro is defined.
int getHeight() const
Return the number of rows in the image.
ImageT::SinglePixel edgePixel(lsst::afw::image::detail::Image_tag)
Return an off-the-edge pixel appropriate for a given Image type.
Simple 2D vector (suitable for use on a GPU)
Functions to help managing setup for GPU kernels.
#define LSST_EXCEPT(type,...)
Support for warping an image to a new WCS.
PixelT & Pixel(int x, int y)
int getWidth() const
Return the number of columns in the image.
Lanczos warping: accurate but slow and can introduce ringing artifacts.
Implementation of the Class MaskedImage.
SVec2 VecSub(SVec2 a, SVec2 b)
int getWidth() const
Return the number of columns in the image.
Functions and a class to help allocating GPU global memory and transferring data to and from a GPU...
A class to represent a 2-dimensional array of pixels.
lsst::afw::geom::Box2I shrinkBBox(lsst::afw::geom::Box2I const &bbox) const
A function to determine whether compiling for GPU is enabled.