37 #include <cuda_runtime.h>
63 namespace mathDetail = lsst::afw::math::detail;
64 namespace gpuDetail = lsst::afw::gpu::detail;
65 namespace afwMath = lsst::afw::math;
66 namespace afwGpu = lsst::afw::gpu;
68 namespace pexExcept = lsst::pex::exceptions;
69 namespace pexLog = lsst::pex::logging;
70 namespace afwGeom = lsst::afw::geom;
79 int CeilDivide(
int num,
int divisor)
81 return (num + divisor - 1) / divisor;
85 int InterpBlkN(
int size ,
int interpLength)
87 return CeilDivide(size , interpLength) + 1;
91 gpu::SPoint2 GetInterpolatedValue(afwGpu::detail::GpuBuffer2D<gpu::BilinearInterp>
const & interpBuf,
92 int blkX,
int blkY,
int subX,
int subY
95 gpu::BilinearInterp interp = interpBuf.Pixel(blkX, blkY);
96 return interp.Interpolate(subX, subY);
100 gpu::SPoint2 GetInterpolatedValue(afwGpu::detail::GpuBuffer2D<gpu::BilinearInterp>
const & interpBuf,
101 int interpLen,
int x,
int y
104 int blkX = x / interpLen;
105 int blkY = y / interpLen;
107 int subX = x % interpLen;
108 int subY = y % interpLen;
110 return GetInterpolatedValue(interpBuf, blkX, blkY, subX, subY);
115 int NumGoodPixels(afwGpu::detail::GpuBuffer2D<gpu::BilinearInterp>
const & interpBuf,
120 int subY = 1, blkY = 0;
122 if (subY >= interpLen) {
127 int subX = 1, blkX = 0;
128 gpu::BilinearInterp interp = interpBuf.Pixel(blkX, blkY);
129 gpu::LinearInterp lineY = interp.GetLinearInterp(subY);
132 if (subX >= interpLen) {
135 interp = interpBuf.Pixel(blkX, blkY);
136 lineY = interp.GetLinearInterp(subY);
138 gpu::SPoint2 srcPos = lineY.Interpolate(subX);
151 template<
typename DestPixelT,
typename SrcPixelT>
152 int WarpImageGpuWrapper(
160 int const interpLength,
166 typename DestImageT::SinglePixel
const edgePixel = padValue;
168 gpu::PixelIVM<DestPixelT> edgePixelGpu;
170 edgePixelGpu.var = -1;
171 edgePixelGpu.msk = -1;
173 int const destWidth = destImage.
getWidth();
174 int const destHeight = destImage.
getHeight();
175 gpuDetail::GpuMemOwner<DestPixelT> destBufImgGpu;
176 gpuDetail::GpuMemOwner<SrcPixelT> srcBufImgGpu;
177 gpuDetail::GpuMemOwner<gpu::BilinearInterp> srcPosInterpGpu;
179 gpu::ImageDataPtr<DestPixelT> destImgGpu;
180 destImgGpu.strideImg = destBufImgGpu.AllocImageBaseBuffer(destImage);
181 if (destBufImgGpu.ptr == NULL) {
182 throw LSST_EXCEPT(afwGpu::GpuMemoryError,
"Not enough memory on GPU for output image");
184 destImgGpu.img = destBufImgGpu.ptr;
185 destImgGpu.var = NULL;
186 destImgGpu.msk = NULL;
187 destImgGpu.width = destWidth;
188 destImgGpu.height = destHeight;
190 gpu::ImageDataPtr<SrcPixelT> srcImgGpu;
191 srcImgGpu.strideImg = srcBufImgGpu.TransferFromImageBase(srcImage);
192 if (srcBufImgGpu.ptr == NULL) {
193 throw LSST_EXCEPT(afwGpu::GpuMemoryError,
"Not enough memory on GPU for input image");
195 srcImgGpu.img = srcBufImgGpu.ptr;
196 srcImgGpu.var = NULL;
197 srcImgGpu.msk = NULL;
198 srcImgGpu.width = srcImage.
getWidth();
201 srcPosInterpGpu.Transfer(srcPosInterp);
202 if (srcPosInterpGpu.ptr == NULL) {
204 "Not enough memory on GPU for interpolation data for coorinate transformation");
210 destImgGpu, srcImgGpu,
216 srcPosInterpGpu.ptr, interpLength
219 int numGoodPixels = NumGoodPixels(srcPosInterp, interpLength, destWidth, destHeight, srcBoxConv);
221 cudaThreadSynchronize();
222 if (cudaGetLastError() != cudaSuccess) {
223 throw LSST_EXCEPT(afwGpu::GpuRuntimeError,
"GPU calculation failed to run");
226 destBufImgGpu.CopyToImageBase(destImage);
227 return numGoodPixels;
233 template<
typename DestPixelT,
typename SrcPixelT>
234 int WarpImageGpuWrapper(
242 int const interpLength,
248 typename DestImageT::SinglePixel
const edgePixel = padValue;
250 gpu::PixelIVM<DestPixelT> edgePixelGpu;
251 edgePixelGpu.img = edgePixel.image();
252 edgePixelGpu.var = edgePixel.variance();
253 edgePixelGpu.msk = edgePixel.mask();
255 int const destWidth = dstImage.
getWidth();
256 int const destHeight = dstImage.
getHeight();
258 gpuDetail::GpuMemOwner<DestPixelT> destBufImgGpu;
259 gpuDetail::GpuMemOwner<gpu::VarPixel> destBufVarGpu;
260 gpuDetail::GpuMemOwner<gpu::MskPixel> destBufMskGpu;
262 gpuDetail::GpuMemOwner<SrcPixelT> srcBufImgGpu;
263 gpuDetail::GpuMemOwner<gpu::VarPixel> srcBufVarGpu;
264 gpuDetail::GpuMemOwner<gpu::MskPixel> srcBufMskGpu;
266 gpuDetail::GpuMemOwner<gpu::BilinearInterp> srcPosInterpGpu;
268 mathDetail::gpu::ImageDataPtr<DestPixelT> destImgGpu;
269 destImgGpu.strideImg = destBufImgGpu.AllocImageBaseBuffer(*dstImage.
getImage());
270 destImgGpu.strideVar = destBufVarGpu.AllocImageBaseBuffer(*dstImage.
getVariance());
271 destImgGpu.strideMsk = destBufMskGpu.AllocImageBaseBuffer(*dstImage.
getMask());
272 if (destBufImgGpu.ptr == NULL) {
273 throw LSST_EXCEPT(afwGpu::GpuMemoryError,
"Not enough memory on GPU for output image");
275 if (destBufVarGpu.ptr == NULL) {
276 throw LSST_EXCEPT(afwGpu::GpuMemoryError,
"Not enough memory on GPU for output variance");
278 if (destBufMskGpu.ptr == NULL) {
279 throw LSST_EXCEPT(afwGpu::GpuMemoryError,
"Not enough memory on GPU for output mask");
281 destImgGpu.img = destBufImgGpu.ptr;
282 destImgGpu.var = destBufVarGpu.ptr;
283 destImgGpu.msk = destBufMskGpu.ptr;
284 destImgGpu.width = destWidth;
285 destImgGpu.height = destHeight;
287 gpu::ImageDataPtr<SrcPixelT> srcImgGpu;
288 srcImgGpu.strideImg = srcBufImgGpu.TransferFromImageBase(*srcImage.
getImage());
289 if (srcBufImgGpu.ptr == NULL) {
290 throw LSST_EXCEPT(afwGpu::GpuMemoryError,
"Not enough memory on GPU for input image");
292 srcImgGpu.strideVar = srcBufVarGpu.TransferFromImageBase(*srcImage.
getVariance());
293 if (srcBufVarGpu.ptr == NULL) {
294 throw LSST_EXCEPT(afwGpu::GpuMemoryError,
"Not enough memory on GPU for input variance");
296 srcImgGpu.strideMsk = srcBufMskGpu.TransferFromImageBase(*srcImage.
getMask());
297 if (srcBufMskGpu.ptr == NULL) {
298 throw LSST_EXCEPT(afwGpu::GpuMemoryError,
"Not enough memory on GPU for input mask");
301 srcImgGpu.img = srcBufImgGpu.ptr;
302 srcImgGpu.var = srcBufVarGpu.ptr;
303 srcImgGpu.msk = srcBufMskGpu.ptr;
304 srcImgGpu.width = srcImage.
getWidth();
307 srcPosInterpGpu.Transfer(srcPosInterp);
308 if (srcPosInterpGpu.ptr == NULL) {
310 "Not enough memory on GPU for interpolation data for coorinate transformation");
316 destImgGpu, srcImgGpu,
322 srcPosInterpGpu.ptr, interpLength
324 int numGoodPixels = NumGoodPixels(srcPosInterp, interpLength, destWidth, destHeight, srcBoxConv);
326 cudaThreadSynchronize();
327 if (cudaGetLastError() != cudaSuccess) {
328 throw LSST_EXCEPT(afwGpu::GpuRuntimeError,
"GPU calculation failed to run");
331 destBufImgGpu.CopyToImageBase(*dstImage.
getImage());
332 destBufVarGpu.CopyToImageBase(*dstImage.
getVariance());
333 destBufMskGpu.CopyToImageBase(*dstImage.
getMask());
335 return numGoodPixels;
348 int destWidth,
int destHeight)
350 int const interpBlkNX = InterpBlkN(destWidth , interpLength);
351 int const interpBlkNY = InterpBlkN(destHeight, interpLength);
353 for (
int row = -1, rowBand = 0; rowBand < interpBlkNY - 1;
row += interpLength, rowBand++) {
354 double const invInterpLen = 1.0 / interpLength;
355 double const invInterpLenRow =
row + interpLength <= destHeight - 1 ?
356 invInterpLen : 1.0 / (destHeight - 1 -
row);
358 for (
int col = -1, colBand = 0; colBand < interpBlkNX - 1;
col += interpLength, colBand++) {
360 const SPoint2 p11 = srcPosInterp.
Pixel(colBand , rowBand ).o;
361 const SPoint2 p12 = srcPosInterp.
Pixel(colBand + 1, rowBand ).o;
362 const SPoint2 p21 = srcPosInterp.
Pixel(colBand , rowBand + 1).o;
363 const SPoint2 p22 = srcPosInterp.
Pixel(colBand + 1, rowBand + 1).o;
369 double const invInterpLenCol =
col + interpLength <= destWidth - 1 ?
370 invInterpLen : 1.0 / (destWidth - 1 -
col);
372 gpu::BilinearInterp lin = srcPosInterp.
Pixel(colBand, rowBand);
373 lin.deltaY =
VecMul(band_dY , invInterpLenRow);
374 lin.d0X =
VecMul(band_d0X, invInterpLenCol);
375 lin.ddX =
VecMul(band_ddX, invInterpLenCol);
376 srcPosInterp.
Pixel(colBand, rowBand) = lin;
379 if (colBand == interpBlkNX - 2) {
380 srcPosInterp.
Pixel(interpBlkNX - 1, rowBand).deltaY =
383 if (rowBand == interpBlkNY - 2) {
384 srcPosInterp.
Pixel(colBand, interpBlkNY - 1).d0X =
394 template<
typename DestImageT,
typename SrcImageT>
396 DestImageT &destImage,
397 SrcImageT
const &srcImage,
401 int const interpLength,
402 typename DestImageT::SinglePixel padValue,
403 const bool forceProcessing
406 if (interpLength < 1) {
407 throw LSST_EXCEPT(pexExcept::InvalidParameterError,
408 "GPU accelerated warping must use interpolation");
411 int const srcWidth = srcImage.getWidth();
412 int const srcHeight = srcImage.getHeight();
413 pexLog::TTrace<3>(
"lsst.afw.math.warp",
"(GPU) source image width=%d; height=%d", srcWidth, srcHeight);
416 throw LSST_EXCEPT(afwGpu::GpuRuntimeError,
"Afw not compiled with GPU support");
422 if (dynamic_cast<afwMath::LanczosWarpingKernel const*>(&maskWarpingKernel)) {
424 }
else if (dynamic_cast<afwMath::BilinearWarpingKernel const*>(&maskWarpingKernel)) {
426 }
else if (dynamic_cast<afwMath::NearestWarpingKernel const*>(&maskWarpingKernel)) {
429 throw LSST_EXCEPT(pexExcept::InvalidParameterError,
"unknown type of mask warping kernel");
438 int const mainKernelSize = 2 * lanczosKernel.
getOrder();
445 if (!forceProcessing && interpLength < 3) {
449 int const destWidth = destImage.getWidth();
450 int const destHeight = destImage.getHeight();
451 pexLog::TTrace<3>(
"lsst.afw.math.warp",
"(GPU) remap image width=%d; height=%d", destWidth, destHeight);
453 int const maxCol = destWidth - 1;
454 int const maxRow = destHeight - 1;
461 int const interpBlkNX = InterpBlkN(destWidth , interpLength);
462 int const interpBlkNY = InterpBlkN(destHeight, interpLength);
467 for (
int rowBand = 0; rowBand < interpBlkNY; rowBand++) {
468 int row = min(maxRow, (rowBand * interpLength - 1));
469 for (
int colBand = 0; colBand < interpBlkNX; colBand++) {
470 int col = min(maxCol, (colBand * interpLength - 1));
473 sSrcPos =
MovePoint(sSrcPos,
SVec2(-srcImage.getX0(), -srcImage.getY0()));
474 srcPosInterp.
Pixel(colBand, rowBand).o = sSrcPos;
478 CalculateInterpolationData(srcPosInterp, interpLength, destWidth, destHeight);
480 int numGoodPixels = 0;
482 pexLog::TTrace<3>(
"lsst.afw.math.warp",
"using GPU acceleration, remapping masked image");
491 numGoodPixels = WarpImageGpuWrapper(destImage,
497 srcPosInterp, interpLength, padValue
507 #define MASKEDIMAGE(PIXTYPE) afwImage::MaskedImage<PIXTYPE, afwImage::MaskPixel, afwImage::VariancePixel>
508 #define IMAGE(PIXTYPE) afwImage::Image<PIXTYPE>
511 #define INSTANTIATE(DESTIMAGEPIXELT, SRCIMAGEPIXELT) \
512 template std::pair<int,WarpImageGpuStatus::ReturnCode> warpImageGPU( \
513 IMAGE(DESTIMAGEPIXELT) &destImage, \
514 IMAGE(SRCIMAGEPIXELT) const &srcImage, \
515 afwMath::LanczosWarpingKernel const &warpingKernel, \
516 afwMath::SeparableKernel const &maskWarpingKernel, \
517 PositionFunctor const &computeSrcPos, \
518 int const interpLength, \
519 IMAGE(DESTIMAGEPIXELT)::SinglePixel padValue, \
520 const bool forceProcessing); NL \
521 template std::pair<int,WarpImageGpuStatus::ReturnCode> warpImageGPU( \
522 MASKEDIMAGE(DESTIMAGEPIXELT) &destImage, \
523 MASKEDIMAGE(SRCIMAGEPIXELT) const &srcImage, \
524 afwMath::LanczosWarpingKernel const &warpingKernel, \
525 afwMath::SeparableKernel const &maskWarpingKernel, \
526 PositionFunctor const &computeSrcPos, \
527 int const interpLength, \
528 MASKEDIMAGE(DESTIMAGEPIXELT)::SinglePixel padValue, \
529 const bool forceProcessing);
GPU accelerared image warping.
Declare the Kernel class and subclasses.
Class for representing an image or 2D array in general)
SPoint2 MovePoint(SPoint2 p, SVec2 v)
Declaration of a GPU kernel for image warping and declarations of requred datatypes.
SVec2 VecMul(SVec2 v, double m)
additional GPU exceptions
int getHeight() const
Return the number of rows in the image.
#define INSTANTIATE(MATCH)
definition of the Trace messaging facilities
A kernel described by a pair of functions: func(x, y) = colFunc(x) * rowFunc(y)
bool isInsideBox(gpu::SPoint2 p)
bool TryToSelectCudaDevice(bool noExceptions, bool reselect=false)
ImagePtr getImage(bool const noThrow=false) const
Return a (Ptr to) the MaskedImage's image.
void WarpImageGpuCallKernel(bool isMaskedImage, ImageDataPtr< DestPixelT > destImageGpu, ImageDataPtr< SrcPixelT > srcImageGpu, int mainKernelSize, KernelType maskKernelType, int maskKernelSize, SBox2I srcGoodBox, PixelIVM< DestPixelT > edgePixel, BilinearInterp *srcPosInterp, int interpLength)
Calls the GPU kernel for lanczos resampling.
defines a 2D range of integer values begX <= x < endX, begY <= y < endY
int const SIZE_MAX_WARPING_KERNEL
contains GpuBuffer2D class (for simple handling of images or 2D arrays)
An integer coordinate rectangle.
VariancePtr getVariance(bool const noThrow=false) const
Return a (Ptr to) the MaskedImage's variance.
table::Key< table::Array< Kernel::Pixel > > image
int getOrder() const
get the order of the kernel
Base class to transform pixel position for a destination image to its position in the original source...
void ImageT ImageT int float saturatedPixelValue int const width
Simple 2D point (suitable for use on a GPU)
MaskPtr getMask(bool const noThrow=false) const
Return a (Ptr to) the MaskedImage's mask.
A class to manipulate images, masks, and variance as a single object.
bool isGpuBuild()
Inline function which returns true only when GPU_BUILD macro is defined.
int getHeight() const
Return the number of rows in the image.
std::pair< int, WarpImageGpuStatus::ReturnCode > warpImageGPU(DestImageT &destImage, SrcImageT const &srcImage, lsst::afw::math::LanczosWarpingKernel const &warpingKernel, lsst::afw::math::SeparableKernel const &maskWarpingKernel, PositionFunctor const &computeSrcPos, int const interpLength, typename DestImageT::SinglePixel padValue, const bool forceProcessing=true)
GPU accelerated image warping using Lanczos resampling.
ImageT::SinglePixel edgePixel(lsst::afw::image::detail::Image_tag)
Return an off-the-edge pixel appropriate for a given Image type.
void ImageT ImageT int float saturatedPixelValue int const height
Simple 2D vector (suitable for use on a GPU)
Functions to help managing setup for GPU kernels.
#define LSST_EXCEPT(type,...)
Support for warping an image to a new WCS.
PixelT & Pixel(int x, int y)
int getWidth() const
Return the number of columns in the image.
Lanczos warping: accurate but slow and can introduce ringing artifacts.
Implementation of the Class MaskedImage.
SVec2 VecSub(SVec2 a, SVec2 b)
int getWidth() const
Return the number of columns in the image.
Functions and a class to help allocating GPU global memory and transferring data to and from a GPU...
A class to represent a 2-dimensional array of pixels.
lsst::afw::geom::Box2I shrinkBBox(lsst::afw::geom::Box2I const &bbox) const
A function to determine whether compiling for GPU is enabled.