37 #include <cuda_runtime.h>
63 namespace mathDetail = lsst::afw::math::detail;
64 namespace gpuDetail = lsst::afw::gpu::detail;
65 namespace afwMath = lsst::afw::math;
66 namespace afwGpu = lsst::afw::gpu;
68 namespace pexExcept = lsst::pex::exceptions;
69 namespace pexLog = lsst::pex::logging;
70 namespace afwGeom = lsst::afw::geom;
79 int CeilDivide(
int num,
int divisor)
81 return (num + divisor - 1) / divisor;
85 int InterpBlkN(
int size ,
int interpLength)
87 return CeilDivide(size , interpLength) + 1;
91 gpu::SPoint2 GetInterpolatedValue(afwGpu::detail::GpuBuffer2D<gpu::BilinearInterp>
const & interpBuf,
92 int blkX,
int blkY,
int subX,
int subY
95 gpu::BilinearInterp interp = interpBuf.Pixel(blkX, blkY);
96 return interp.Interpolate(subX, subY);
100 gpu::SPoint2 GetInterpolatedValue(afwGpu::detail::GpuBuffer2D<gpu::BilinearInterp>
const & interpBuf,
101 int interpLen,
int x,
int y
104 int blkX = x / interpLen;
105 int blkY = y / interpLen;
107 int subX = x % interpLen;
108 int subY = y % interpLen;
110 return GetInterpolatedValue(interpBuf, blkX, blkY, subX, subY);
115 int NumGoodPixels(afwGpu::detail::GpuBuffer2D<gpu::BilinearInterp>
const & interpBuf,
116 int const interpLen,
int const width,
int const height,
SBox2I srcGoodBox)
120 int subY = 1, blkY = 0;
121 for (
int row = 0;
row < height;
row++, subY++) {
122 if (subY >= interpLen) {
127 int subX = 1, blkX = 0;
128 gpu::BilinearInterp interp = interpBuf.Pixel(blkX, blkY);
129 gpu::LinearInterp lineY = interp.GetLinearInterp(subY);
131 for (
int col = 0;
col < width;
col++, subX++) {
132 if (subX >= interpLen) {
135 interp = interpBuf.Pixel(blkX, blkY);
136 lineY = interp.GetLinearInterp(subY);
138 gpu::SPoint2 srcPos = lineY.Interpolate(subX);
151 template<
typename DestPixelT,
typename SrcPixelT>
152 int WarpImageGpuWrapper(
160 int const interpLength,
166 typename DestImageT::SinglePixel
const edgePixel = padValue;
168 gpu::PixelIVM<DestPixelT> edgePixelGpu;
170 edgePixelGpu.var = -1;
171 edgePixelGpu.msk = -1;
173 int const destWidth = destImage.
getWidth();
174 int const destHeight = destImage.
getHeight();
175 gpuDetail::GpuMemOwner<DestPixelT> destBufImgGpu;
176 gpuDetail::GpuMemOwner<SrcPixelT> srcBufImgGpu;
177 gpuDetail::GpuMemOwner<gpu::BilinearInterp> srcPosInterpGpu;
179 gpu::ImageDataPtr<DestPixelT> destImgGpu;
180 destImgGpu.strideImg = destBufImgGpu.AllocImageBaseBuffer(destImage);
181 if (destBufImgGpu.ptr == NULL) {
182 throw LSST_EXCEPT(afwGpu::GpuMemoryError,
"Not enough memory on GPU for output image");
184 destImgGpu.img = destBufImgGpu.ptr;
185 destImgGpu.var = NULL;
186 destImgGpu.msk = NULL;
187 destImgGpu.width = destWidth;
188 destImgGpu.height = destHeight;
190 gpu::ImageDataPtr<SrcPixelT> srcImgGpu;
191 srcImgGpu.strideImg = srcBufImgGpu.TransferFromImageBase(srcImage);
192 if (srcBufImgGpu.ptr == NULL) {
193 throw LSST_EXCEPT(afwGpu::GpuMemoryError,
"Not enough memory on GPU for input image");
195 srcImgGpu.img = srcBufImgGpu.ptr;
196 srcImgGpu.var = NULL;
197 srcImgGpu.msk = NULL;
198 srcImgGpu.width = srcImage.
getWidth();
201 srcPosInterpGpu.Transfer(srcPosInterp);
202 if (srcPosInterpGpu.ptr == NULL) {
204 "Not enough memory on GPU for interpolation data for coorinate transformation");
210 destImgGpu, srcImgGpu,
216 srcPosInterpGpu.ptr, interpLength
219 int numGoodPixels = NumGoodPixels(srcPosInterp, interpLength, destWidth, destHeight, srcBoxConv);
221 cudaThreadSynchronize();
222 if (cudaGetLastError() != cudaSuccess) {
223 throw LSST_EXCEPT(afwGpu::GpuRuntimeError,
"GPU calculation failed to run");
226 destBufImgGpu.CopyToImageBase(destImage);
227 return numGoodPixels;
233 template<
typename DestPixelT,
typename SrcPixelT>
234 int WarpImageGpuWrapper(
242 int const interpLength,
248 typename DestImageT::SinglePixel
const edgePixel = padValue;
250 gpu::PixelIVM<DestPixelT> edgePixelGpu;
251 edgePixelGpu.img = edgePixel.image();
252 edgePixelGpu.var = edgePixel.variance();
253 edgePixelGpu.msk = edgePixel.mask();
255 int const destWidth = dstImage.
getWidth();
256 int const destHeight = dstImage.
getHeight();
258 gpuDetail::GpuMemOwner<DestPixelT> destBufImgGpu;
259 gpuDetail::GpuMemOwner<gpu::VarPixel> destBufVarGpu;
260 gpuDetail::GpuMemOwner<gpu::MskPixel> destBufMskGpu;
262 gpuDetail::GpuMemOwner<SrcPixelT> srcBufImgGpu;
263 gpuDetail::GpuMemOwner<gpu::VarPixel> srcBufVarGpu;
264 gpuDetail::GpuMemOwner<gpu::MskPixel> srcBufMskGpu;
266 gpuDetail::GpuMemOwner<gpu::BilinearInterp> srcPosInterpGpu;
268 mathDetail::gpu::ImageDataPtr<DestPixelT> destImgGpu;
269 destImgGpu.strideImg = destBufImgGpu.AllocImageBaseBuffer(*dstImage.
getImage());
270 destImgGpu.strideVar = destBufVarGpu.AllocImageBaseBuffer(*dstImage.
getVariance());
271 destImgGpu.strideMsk = destBufMskGpu.AllocImageBaseBuffer(*dstImage.
getMask());
272 if (destBufImgGpu.ptr == NULL) {
273 throw LSST_EXCEPT(afwGpu::GpuMemoryError,
"Not enough memory on GPU for output image");
275 if (destBufVarGpu.ptr == NULL) {
276 throw LSST_EXCEPT(afwGpu::GpuMemoryError,
"Not enough memory on GPU for output variance");
278 if (destBufMskGpu.ptr == NULL) {
279 throw LSST_EXCEPT(afwGpu::GpuMemoryError,
"Not enough memory on GPU for output mask");
281 destImgGpu.img = destBufImgGpu.ptr;
282 destImgGpu.var = destBufVarGpu.ptr;
283 destImgGpu.msk = destBufMskGpu.ptr;
284 destImgGpu.width = destWidth;
285 destImgGpu.height = destHeight;
287 gpu::ImageDataPtr<SrcPixelT> srcImgGpu;
288 srcImgGpu.strideImg = srcBufImgGpu.TransferFromImageBase(*srcImage.
getImage());
289 if (srcBufImgGpu.ptr == NULL) {
290 throw LSST_EXCEPT(afwGpu::GpuMemoryError,
"Not enough memory on GPU for input image");
292 srcImgGpu.strideVar = srcBufVarGpu.TransferFromImageBase(*srcImage.
getVariance());
293 if (srcBufVarGpu.ptr == NULL) {
294 throw LSST_EXCEPT(afwGpu::GpuMemoryError,
"Not enough memory on GPU for input variance");
296 srcImgGpu.strideMsk = srcBufMskGpu.TransferFromImageBase(*srcImage.
getMask());
297 if (srcBufMskGpu.ptr == NULL) {
298 throw LSST_EXCEPT(afwGpu::GpuMemoryError,
"Not enough memory on GPU for input mask");
301 srcImgGpu.img = srcBufImgGpu.ptr;
302 srcImgGpu.var = srcBufVarGpu.ptr;
303 srcImgGpu.msk = srcBufMskGpu.ptr;
304 srcImgGpu.width = srcImage.
getWidth();
307 srcPosInterpGpu.Transfer(srcPosInterp);
308 if (srcPosInterpGpu.ptr == NULL) {
310 "Not enough memory on GPU for interpolation data for coorinate transformation");
316 destImgGpu, srcImgGpu,
322 srcPosInterpGpu.ptr, interpLength
324 int numGoodPixels = NumGoodPixels(srcPosInterp, interpLength, destWidth, destHeight, srcBoxConv);
326 cudaThreadSynchronize();
327 if (cudaGetLastError() != cudaSuccess) {
328 throw LSST_EXCEPT(afwGpu::GpuRuntimeError,
"GPU calculation failed to run");
331 destBufImgGpu.CopyToImageBase(*dstImage.
getImage());
332 destBufVarGpu.CopyToImageBase(*dstImage.
getVariance());
333 destBufMskGpu.CopyToImageBase(*dstImage.
getMask());
335 return numGoodPixels;
348 int destWidth,
int destHeight)
350 int const interpBlkNX = InterpBlkN(destWidth , interpLength);
351 int const interpBlkNY = InterpBlkN(destHeight, interpLength);
353 for (
int row = -1, rowBand = 0; rowBand < interpBlkNY - 1;
row += interpLength, rowBand++) {
354 double const invInterpLen = 1.0 / interpLength;
355 double const invInterpLenRow =
row + interpLength <= destHeight - 1 ?
356 invInterpLen : 1.0 / (destHeight - 1 -
row);
358 for (
int col = -1, colBand = 0; colBand < interpBlkNX - 1;
col += interpLength, colBand++) {
360 const SPoint2 p11 = srcPosInterp.
Pixel(colBand , rowBand ).o;
361 const SPoint2 p12 = srcPosInterp.
Pixel(colBand + 1, rowBand ).o;
362 const SPoint2 p21 = srcPosInterp.
Pixel(colBand , rowBand + 1).o;
363 const SPoint2 p22 = srcPosInterp.
Pixel(colBand + 1, rowBand + 1).o;
369 double const invInterpLenCol =
col + interpLength <= destWidth - 1 ?
370 invInterpLen : 1.0 / (destWidth - 1 -
col);
372 gpu::BilinearInterp lin = srcPosInterp.
Pixel(colBand, rowBand);
373 lin.deltaY =
VecMul(band_dY , invInterpLenRow);
374 lin.d0X =
VecMul(band_d0X, invInterpLenCol);
375 lin.ddX =
VecMul(band_ddX, invInterpLenCol);
376 srcPosInterp.
Pixel(colBand, rowBand) = lin;
379 if (colBand == interpBlkNX - 2) {
380 srcPosInterp.
Pixel(interpBlkNX - 1, rowBand).deltaY =
383 if (rowBand == interpBlkNY - 2) {
384 srcPosInterp.
Pixel(colBand, interpBlkNY - 1).d0X =
394 template<
typename DestImageT,
typename SrcImageT>
396 DestImageT &destImage,
397 SrcImageT
const &srcImage,
401 int const interpLength,
402 typename DestImageT::SinglePixel padValue,
403 const bool forceProcessing
406 if (interpLength < 1) {
407 throw LSST_EXCEPT(pexExcept::InvalidParameterError,
408 "GPU accelerated warping must use interpolation");
411 int const srcWidth = srcImage.getWidth();
412 int const srcHeight = srcImage.getHeight();
413 pexLog::TTrace<3>(
"lsst.afw.math.warp",
"(GPU) source image width=%d; height=%d", srcWidth, srcHeight);
416 throw LSST_EXCEPT(afwGpu::GpuRuntimeError,
"Afw not compiled with GPU support");
422 if (dynamic_cast<afwMath::LanczosWarpingKernel const*>(&maskWarpingKernel)) {
424 }
else if (dynamic_cast<afwMath::BilinearWarpingKernel const*>(&maskWarpingKernel)) {
426 }
else if (dynamic_cast<afwMath::NearestWarpingKernel const*>(&maskWarpingKernel)) {
429 throw LSST_EXCEPT(pexExcept::InvalidParameterError,
"unknown type of mask warping kernel");
438 int const mainKernelSize = 2 * lanczosKernel.
getOrder();
445 if (!forceProcessing && interpLength < 3) {
449 int const destWidth = destImage.getWidth();
450 int const destHeight = destImage.getHeight();
451 pexLog::TTrace<3>(
"lsst.afw.math.warp",
"(GPU) remap image width=%d; height=%d", destWidth, destHeight);
453 int const maxCol = destWidth - 1;
454 int const maxRow = destHeight - 1;
461 int const interpBlkNX = InterpBlkN(destWidth , interpLength);
462 int const interpBlkNY = InterpBlkN(destHeight, interpLength);
467 for (
int rowBand = 0; rowBand < interpBlkNY; rowBand++) {
468 int row = min(maxRow, (rowBand * interpLength - 1));
469 for (
int colBand = 0; colBand < interpBlkNX; colBand++) {
470 int col = min(maxCol, (colBand * interpLength - 1));
473 sSrcPos =
MovePoint(sSrcPos,
SVec2(-srcImage.getX0(), -srcImage.getY0()));
474 srcPosInterp.
Pixel(colBand, rowBand).o = sSrcPos;
478 CalculateInterpolationData(srcPosInterp, interpLength, destWidth, destHeight);
480 int numGoodPixels = 0;
482 pexLog::TTrace<3>(
"lsst.afw.math.warp",
"using GPU acceleration, remapping masked image");
491 numGoodPixels = WarpImageGpuWrapper(destImage,
497 srcPosInterp, interpLength, padValue
507 #define MASKEDIMAGE(PIXTYPE) afwImage::MaskedImage<PIXTYPE, afwImage::MaskPixel, afwImage::VariancePixel>
508 #define IMAGE(PIXTYPE) afwImage::Image<PIXTYPE>
511 #define INSTANTIATE(DESTIMAGEPIXELT, SRCIMAGEPIXELT) \
512 template std::pair<int,WarpImageGpuStatus::ReturnCode> warpImageGPU( \
513 IMAGE(DESTIMAGEPIXELT) &destImage, \
514 IMAGE(SRCIMAGEPIXELT) const &srcImage, \
515 afwMath::LanczosWarpingKernel const &warpingKernel, \
516 afwMath::SeparableKernel const &maskWarpingKernel, \
517 PositionFunctor const &computeSrcPos, \
518 int const interpLength, \
519 IMAGE(DESTIMAGEPIXELT)::SinglePixel padValue, \
520 const bool forceProcessing); NL \
521 template std::pair<int,WarpImageGpuStatus::ReturnCode> warpImageGPU( \
522 MASKEDIMAGE(DESTIMAGEPIXELT) &destImage, \
523 MASKEDIMAGE(SRCIMAGEPIXELT) const &srcImage, \
524 afwMath::LanczosWarpingKernel const &warpingKernel, \
525 afwMath::SeparableKernel const &maskWarpingKernel, \
526 PositionFunctor const &computeSrcPos, \
527 int const interpLength, \
528 MASKEDIMAGE(DESTIMAGEPIXELT)::SinglePixel padValue, \
529 const bool forceProcessing);
GPU accelerared image warping.
int getWidth() const
Return the number of columns in the image.
ImagePtr getImage(bool const noThrow=false) const
Return a (Ptr to) the MaskedImage's image.
Declare the Kernel class and subclasses.
Class for representing an image or 2D array in general)
SPoint2 MovePoint(SPoint2 p, SVec2 v)
Declaration of a GPU kernel for image warping and declarations of requred datatypes.
SVec2 VecMul(SVec2 v, double m)
additional GPU exceptions
definition of the Trace messaging facilities
A kernel described by a pair of functions: func(x, y) = colFunc(x) * rowFunc(y)
bool TryToSelectCudaDevice(bool noExceptions, bool reselect=false)
void WarpImageGpuCallKernel(bool isMaskedImage, ImageDataPtr< DestPixelT > destImageGpu, ImageDataPtr< SrcPixelT > srcImageGpu, int mainKernelSize, KernelType maskKernelType, int maskKernelSize, SBox2I srcGoodBox, PixelIVM< DestPixelT > edgePixel, BilinearInterp *srcPosInterp, int interpLength)
Calls the GPU kernel for lanczos resampling.
defines a 2D range of integer values begX <= x < endX, begY <= y < endY
int const SIZE_MAX_WARPING_KERNEL
contains GpuBuffer2D class (for simple handling of images or 2D arrays)
bool isInsideBox(gpu::SPoint2 p)
An integer coordinate rectangle.
table::Key< table::Array< Kernel::Pixel > > image
VariancePtr getVariance(bool const noThrow=false) const
Return a (Ptr to) the MaskedImage's variance.
int getWidth() const
Return the number of columns in the image.
Base class to transform pixel position for a destination image to its position in the original source...
int getOrder() const
get the order of the kernel
Simple 2D point (suitable for use on a GPU)
A class to manipulate images, masks, and variance as a single object.
bool isGpuBuild()
Inline function which returns true only when GPU_BUILD macro is defined.
std::pair< int, WarpImageGpuStatus::ReturnCode > warpImageGPU(DestImageT &destImage, SrcImageT const &srcImage, lsst::afw::math::LanczosWarpingKernel const &warpingKernel, lsst::afw::math::SeparableKernel const &maskWarpingKernel, PositionFunctor const &computeSrcPos, int const interpLength, typename DestImageT::SinglePixel padValue, const bool forceProcessing=true)
GPU accelerated image warping using Lanczos resampling.
ImageT::SinglePixel edgePixel(lsst::afw::image::detail::Image_tag)
Return an off-the-edge pixel appropriate for a given Image type.
Simple 2D vector (suitable for use on a GPU)
Functions to help managing setup for GPU kernels.
#define LSST_EXCEPT(type,...)
MaskPtr getMask(bool const noThrow=false) const
Return a (Ptr to) the MaskedImage's mask.
Support for warping an image to a new WCS.
Lanczos warping: accurate but slow and can introduce ringing artifacts.
Implementation of the Class MaskedImage.
int getHeight() const
Return the number of rows in the image.
SVec2 VecSub(SVec2 a, SVec2 b)
lsst::afw::geom::Box2I shrinkBBox(lsst::afw::geom::Box2I const &bbox) const
int getHeight() const
Return the number of rows in the image.
PixelT & Pixel(int x, int y)
Functions and a class to help allocating GPU global memory and transferring data to and from a GPU...
A class to represent a 2-dimensional array of pixels.
A function to determine whether compiling for GPU is enabled.