doxygen/x_11_0/_basic_convolve_g_p_u_8cc_source.html

 // -*- LSST-C++ -*-


 /*

  * LSST Data Management System

  * Copyright 2008, 2009, 2010 LSST Corporation.

  *

  * This product includes software developed by the

  * LSST Project (http://www.lsst.org/).

  *

  * This program is free software: you can redistribute it and/or modify

  * it under the terms of the GNU General Public License as published by

  * the Free Software Foundation, either version 3 of the License, or

  * (at your option) any later version.

  *

  * This program is distributed in the hope that it will be useful,

  * but WITHOUT ANY WARRANTY; without even the implied warranty of

  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

  * GNU General Public License for more details.

  *

  * You should have received a copy of the LSST License Statement and

  * the GNU General Public License along with this program.  If not,

  * see <http://www.lsstcorp.org/LegalNotices/>.

  */


 #include <algorithm>

 #include <cmath>

 #include <sstream>

 #include <vector>


 #include "boost/cstdint.hpp"


 #include "lsst/pex/exceptions.h"

 #include "lsst/pex/logging/Trace.h"

 #include "lsst/afw/image/MaskedImage.h"

 #include "lsst/afw/math/ConvolveImage.h"

 #include "lsst/afw/math/Kernel.h"

 #include "lsst/afw/math/FunctionLibrary.h"

 #include "lsst/afw/geom.h"

 #include "lsst/afw/math/detail/ConvCpuGpuShared.h"

 #include "lsst/afw/math/detail/Convolve.h"


 #include "lsst/afw/math/detail/ConvolveGPU.h"

 #include "lsst/afw/math/detail/convCUDA.h"

 #include "lsst/afw/gpu/detail/GpuBuffer2D.h"

 #include "lsst/afw/math/detail/cudaConvWrapper.h"

 #include "lsst/afw/gpu/detail/CudaSelectGpu.h"

 #include "lsst/afw/gpu/IsGpuBuild.h"

 #include "lsst/afw/gpu/GpuExceptions.h"


 namespace pexExcept = lsst::pex::exceptions;

 namespace pexLog = lsst::pex::logging;

 namespace afwGeom = lsst::afw::geom;

 namespace afwImage = lsst::afw::image;

 namespace afwMath = lsst::afw::math;

 namespace afwGpu = lsst::afw::gpu;

 namespace mathDetail = lsst::afw::math::detail;

 namespace gpuDetail = lsst::afw::gpu::detail;


 namespace {


 // Short local names for the pixel types declared in lsst::afw::math::detail.

 // Element type of the variance buffers filled and read by the copy helpers below.
 typedef mathDetail::VarPixel VarPixel;

 // Element type of the mask buffers filled and read by the copy helpers below.
 typedef mathDetail::MskPixel MskPixel;

 // NOTE(review): presumably the kernel pixel type; it is not referenced in the visible code - confirm.
 typedef mathDetail::KerPixel KerPixel;


 /**
  * Split a MaskedImage into three separate plane buffers.
  *
  * Each buffer is (re)initialised to the dimensions of @a image and then filled row by row
  * with the image, variance and mask values of the corresponding pixels.
  *
  * @param[in]  image  masked image to read from
  * @param[out] img    receives the image plane
  * @param[out] var    receives the variance plane
  * @param[out] msk    receives the mask plane
  */
 template <typename PixelT>
 void CopyFromMaskedImage(afwImage::MaskedImage<PixelT, MskPixel, VarPixel> const& image,
                          gpuDetail::GpuBuffer2D<PixelT>& img,
                          gpuDetail::GpuBuffer2D<VarPixel>& var,
                          gpuDetail::GpuBuffer2D<MskPixel>& msk)
 {
     typedef typename afwImage::MaskedImage<PixelT, MskPixel, VarPixel>::x_iterator RowIter;

     int const nCols = image.getWidth();
     int const nRows = image.getHeight();

     img.Init(nCols, nRows);
     var.Init(nCols, nRows);
     msk.Init(nCols, nRows);

     for (int row = 0; row < nRows; ++row) {
         RowIter src = image.x_at(0, row);

         // start-of-row pointers into the three destination buffers
         PixelT*   dstImg = img.GetImgLinePtr(row);
         MskPixel* dstMsk = msk.GetImgLinePtr(row);
         VarPixel* dstVar = var.GetImgLinePtr(row);

         for (int col = 0; col < nCols; ++col, ++src) {
             dstImg[col] = (*src).image();
             dstMsk[col] = (*src).mask();
             dstVar[col] = (*src).variance();
         }
     }
 }


 /**
  * Write three plane buffers back into a region of a MaskedImage.
  *
  * The three buffers must have identical dimensions (checked with assert). Buffer pixel (x, y)
  * is stored at (startX + x, startY + y) of @a outImage.
  *
  * @param[out] outImage  masked image to write into
  * @param[in]  startX    column in @a outImage of the buffers' first column
  * @param[in]  startY    row in @a outImage of the buffers' first row
  * @param[in]  img       image plane to copy
  * @param[in]  var       variance plane to copy
  * @param[in]  msk       mask plane to copy
  */
 template <typename PixelT>
 void CopyToImage(afwImage::MaskedImage<PixelT, MskPixel, VarPixel>& outImage,
                  int startX, int startY,
                  const gpuDetail::GpuBuffer2D<PixelT>& img,
                  const gpuDetail::GpuBuffer2D<VarPixel>& var,
                  const gpuDetail::GpuBuffer2D<MskPixel>& msk)
 {
     assert(img.height == var.height);
     assert(img.height == msk.height);
     assert(img.width == var.width);
     assert(img.width == msk.width);

     typedef typename afwImage::MaskedImage<PixelT, MskPixel, VarPixel>::x_iterator RowIter;
     typedef typename RowIter::type MaskedPixel;

     for (int row = 0; row < img.height; ++row) {
         // start-of-row pointers into the three source buffers
         const PixelT*   srcImg = img.GetImgLinePtr(row);
         const MskPixel* srcMsk = msk.GetImgLinePtr(row);
         const VarPixel* srcVar = var.GetImgLinePtr(row);

         RowIter dst = outImage.x_at(startX, row + startY);
         for (int col = 0; col < img.width; ++col, ++dst) {
             *dst = MaskedPixel(srcImg[col], srcMsk[col], srcVar[col]);
         }
     }
 }


 }   // anonymous namespace


 /**
  * Choose the GPU convolution routine that matches the run-time type of @a kernel.
  *
  * Delta-function and separable kernels are reported as unsupported. A spatially varying
  * LinearCombinationKernel goes to convolveLinearCombinationGPU; every other kernel goes to
  * convolveSpatiallyInvariantGPU.
  *
  * @param[out] convolvedImage      destination image
  * @param[in]  inImage             image to convolve
  * @param[in]  kernel              kernel to convolve with
  * @param[in]  convolutionControl  convolution parameters, forwarded unchanged
  * @return the status code of the selected routine, or UNSUPPORTED_KERNEL
  * @throw afwGpu::GpuRuntimeError if afw was not compiled with GPU support
  */
 template <typename OutImageT, typename InImageT>
 mathDetail::ConvolveGpuStatus::ReturnCode mathDetail::basicConvolveGPU(
     OutImageT &convolvedImage,
     InImageT const& inImage,
     afwMath::Kernel const& kernel,
     afwMath::ConvolutionControl const& convolutionControl)
 {
     if (!afwGpu::isGpuBuild()) {
         throw LSST_EXCEPT(afwGpu::GpuRuntimeError, "Afw not compiled with GPU support");
     }

     // convolve is not a member of Kernel, so virtual dispatch cannot pick the right routine when
     // all we hold is a reference to the Kernel base class (e.g. as used in LinearCombinationKernel);
     // the dynamic type is therefore inspected by hand.
     if (IS_INSTANCE(kernel, afwMath::DeltaFunctionKernel)) {
         return mathDetail::ConvolveGpuStatus::UNSUPPORTED_KERNEL;
     }
     if (IS_INSTANCE(kernel, afwMath::SeparableKernel)) {
         return mathDetail::ConvolveGpuStatus::UNSUPPORTED_KERNEL;
     }

     if (IS_INSTANCE(kernel, afwMath::LinearCombinationKernel) && kernel.isSpatiallyVarying()) {
         pexLog::TTrace<4>("lsst.afw.math.convolve",
                           "generic basicConvolve (GPU): dispatch to convolveLinearCombinationGPU");
         afwMath::LinearCombinationKernel const* linCombKernel =
             dynamic_cast<afwMath::LinearCombinationKernel const*>(&kernel);
         return mathDetail::convolveLinearCombinationGPU(convolvedImage, inImage,
                 *linCombKernel,
                 convolutionControl);
     }

     // anything else: brute force
     pexLog::TTrace<3>("lsst.afw.math.convolve",
                       "generic basicConvolve (GPU): dispatch to convolveSpatiallyInvariantGPU");
     return mathDetail::convolveSpatiallyInvariantGPU(convolvedImage, inImage, kernel, convolutionControl);
 }


 /**
  * Convolve a MaskedImage with a LinearCombinationKernel using the GPU.
  *
  * A spatially invariant kernel is handed to convolveSpatiallyInvariantGPU together with the
  * caller's complete ConvolutionControl. For a spatially varying kernel the kernel is refactored
  * when it has more kernel parameters than spatial parameters, a series of checks decides whether
  * the GPU can be used, and the image, variance and mask planes are copied into GpuBuffer2D
  * objects, convolved, and copied back.
  *
  * Only the region of @a convolvedImage that starts at the kernel centre and measures
  * (input width + 1 - kernel width) by (input height + 1 - kernel height) is written.
  *
  * @param[out] convolvedImage      destination masked image
  * @param[in]  inImage             masked image to convolve
  * @param[in]  kernel              kernel to convolve with
  * @param[in]  convolutionControl  supplies doNormalize and the device preference
  * @return ConvolveGpuStatus::OK on success; otherwise a code naming why the GPU was not used
  *         (NO_GPU, SFN_COUNT_ERROR, SFN_TYPE_ERROR, KERNEL_TOO_SMALL, KERNEL_TOO_BIG,
  *         KERNEL_COUNT_ERROR, INVALID_KERNEL_DATA, or whatever the delegate returns)
  * @throw afwGpu::GpuRuntimeError if afw was not compiled with GPU support
  */
 template <typename OutPixelT, typename InPixelT>
 mathDetail::ConvolveGpuStatus::ReturnCode mathDetail::convolveLinearCombinationGPU(
     afwImage::MaskedImage<OutPixelT, MskPixel, VarPixel>& convolvedImage,
     afwImage::MaskedImage<InPixelT , MskPixel, VarPixel> const& inImage,
     afwMath::LinearCombinationKernel const& kernel,
     afwMath::ConvolutionControl const & convolutionControl)
 {
     if (!afwGpu::isGpuBuild()) {
         throw LSST_EXCEPT(afwGpu::GpuRuntimeError, "Afw not compiled with GPU support");
     }
     typedef typename afwMath::Kernel::Pixel KernelPixel;
     typedef afwImage::Image<KernelPixel> KernelImage;
     typedef gpuDetail::GpuBuffer2D<KernelPixel> KernelBuffer;

     if (!kernel.isSpatiallyVarying()) {
         // use the standard algorithm for the spatially invariant case.
         // The whole control object is forwarded: passing only getDoNormalize() would lose the
         // caller's device preference, which the delegate reads.
         pexLog::TTrace<3>("lsst.afw.math.convolve",
                           "convolveLinearCombinationGPU: spatially invariant; will delegate");
         return mathDetail::convolveSpatiallyInvariantGPU(convolvedImage, inImage, kernel,
                 convolutionControl);
     }

     // device selection is asked to suppress exceptions unless the caller insists on the GPU
     bool throwExceptionsOn = convolutionControl.getDevicePreference() == afwGpu::USE_GPU;
     if (afwGpu::detail::TryToSelectCudaDevice(!throwExceptionsOn) == false) {
         return mathDetail::ConvolveGpuStatus::NO_GPU;
     }

     // refactor the kernel if this is reasonable and possible;
     // then use the standard algorithm for the spatially varying case
     afwMath::Kernel::Ptr refKernelPtr; // possibly refactored version of kernel
     if (static_cast<int>(kernel.getNKernelParameters()) > kernel.getNSpatialParameters()) {
         // refactoring will speed convolution, so try it
         refKernelPtr = kernel.refactor();
         if (!refKernelPtr) {
             refKernelPtr = kernel.clone();
         }
     } else {
         // too few basis kernels for refactoring to be worthwhile
         refKernelPtr = kernel.clone();
     }
     assertDimensionsOK(convolvedImage, inImage, kernel);

     const afwMath::LinearCombinationKernel* newKernel =
         dynamic_cast<afwMath::LinearCombinationKernel*> (refKernelPtr.get());
     assert(newKernel!=NULL);

     const int kernelN = newKernel->getNBasisKernels();
     const std::vector< afwMath::Kernel::SpatialFunctionPtr > sFn = newKernel->getSpatialFunctionList();
     if (sFn.size() < 1) {
         return mathDetail::ConvolveGpuStatus::SFN_COUNT_ERROR;
     }
     if (int(sFn.size()) != kernelN) {
         return mathDetail::ConvolveGpuStatus::SFN_COUNT_ERROR;
     }

     // all spatial functions must be of one and the same supported type
     bool isAllCheby = true;
     bool isAllPoly = true;
     for (int i = 0; i < kernelN; i++) {
         if (! IS_INSTANCE( *sFn[i], afwMath::Chebyshev1Function2<double> ) ) {
             isAllCheby = false;
         }
         if (! IS_INSTANCE( *sFn[i], afwMath::PolynomialFunction2<double> ) ) {
             isAllPoly = false;
         }
     }

     // the order is read from the first spatial function only
     int order = 0;
 #ifdef GPU_BUILD
     SpatialFunctionType_t sfType;
 #endif
     if (isAllPoly) {
         order = dynamic_cast<const afwMath::PolynomialFunction2<double>*>( sFn[0].get() ) ->getOrder();
 #ifdef GPU_BUILD
         sfType = sftPolynomial;
 #endif
     } else if (isAllCheby) {
         order = dynamic_cast<const afwMath::Chebyshev1Function2<double>*>( sFn[0].get() ) ->getOrder();
 #ifdef GPU_BUILD
         sfType = sftChebyshev;
 #endif
     } else {
         return mathDetail::ConvolveGpuStatus::SFN_TYPE_ERROR;
     }

     // basis kernels; getKernelList() returns a const reference into newKernel, which refKernelPtr
     // keeps alive until the end of this function
     afwMath::KernelList const& kernelList = newKernel->getKernelList();

     //if kernel is too small, call CPU convolution
     // NOTE(review): the plain-Image overload uses a threshold of 20 here - confirm the difference is intended.
     const int minKernelSize = 25;
     if (newKernel->getWidth() * newKernel->getHeight() < minKernelSize &&
             convolutionControl.getDevicePreference() != lsst::afw::gpu::USE_GPU) {
         return mathDetail::ConvolveGpuStatus::KERNEL_TOO_SMALL;
     }

     //if something is wrong, call CPU convolution
     const bool shMemOkA = IsSufficientSharedMemoryAvailable_ForImgAndMaskBlock(
                               newKernel->getWidth(), newKernel->getHeight(), sizeof(double));
     const bool shMemOkB = IsSufficientSharedMemoryAvailable_ForSfn(order, kernelN);
     if (!shMemOkA || !shMemOkB) {
         //cannot fit kernels into shared memory, revert to convolution by CPU
         return mathDetail::ConvolveGpuStatus::KERNEL_TOO_BIG;
     }

     if (kernelN == 0 || kernelN > detail::gpu::maxGpuSfCount) {
         return mathDetail::ConvolveGpuStatus::KERNEL_COUNT_ERROR;
     }
     // every basis kernel must share the dimensions and centre of the combined kernel
     for (int i = 0; i < kernelN; i++) {
         if (kernelList[i]->getDimensions() != newKernel->getDimensions()
                 || kernelList[i]->getCtr() != newKernel->getCtr()
            ) {
             return mathDetail::ConvolveGpuStatus::INVALID_KERNEL_DATA;
         }
     }

     // all checks passed; this message was previously emitted twice, it is now traced once
     pexLog::TTrace<3>("lsst.afw.math.convolve",
             "MaskedImage, convolveLinearCombinationGPU: will use GPU acceleration");

     // un-normalised image of each basis kernel
     std::vector< KernelBuffer >  basisKernels(kernelN);
     for (int i = 0; i < kernelN; i++) {
         KernelImage kernelImage(kernelList[i]->getDimensions());
         (void)kernelList[i]->computeImage(kernelImage, false);
         basisKernels[i].Init(kernelImage);
     }

     int const inImageWidth = inImage.getWidth();
     int const inImageHeight = inImage.getHeight();
     int const cnvWidth = inImageWidth + 1 - newKernel->getWidth();
     int const cnvHeight = inImageHeight + 1 - newKernel->getHeight();
     int const cnvStartX = newKernel->getCtrX();
     int const cnvStartY = newKernel->getCtrY();

     // image positions of the output columns and rows
     std::vector<double> colPos(cnvWidth);
     std::vector<double> rowPos(cnvHeight);

     for (int i = 0; i < cnvWidth; i++) {
         colPos[i] = inImage.indexToPosition(i + cnvStartX, afwImage::X);
     }
     for (int i = 0; i < cnvHeight; i++) {
         rowPos[i] = inImage.indexToPosition(i + cnvStartY, afwImage::Y);
     }

     gpuDetail::GpuBuffer2D<InPixelT>  inBufImg;
     gpuDetail::GpuBuffer2D<VarPixel>  inBufVar;
     gpuDetail::GpuBuffer2D<MskPixel>  inBufMsk;

     CopyFromMaskedImage(inImage, inBufImg, inBufVar, inBufMsk);

     gpuDetail::GpuBuffer2D<OutPixelT> outBufImg(cnvWidth, cnvHeight);
     gpuDetail::GpuBuffer2D<VarPixel>  outBufVar(cnvWidth, cnvHeight);
     gpuDetail::GpuBuffer2D<MskPixel>  outBufMsk(cnvWidth, cnvHeight);

 #ifdef GPU_BUILD
     GPU_ConvolutionMI_LinearCombinationKernel<OutPixelT, InPixelT>(
         inBufImg, inBufVar, inBufMsk,
         colPos, rowPos,
         sFn,
         outBufImg, outBufVar, outBufMsk,
         basisKernels,
         sfType,
         convolutionControl.getDoNormalize()
     );
 #endif

     CopyToImage(convolvedImage, cnvStartX, cnvStartY,
                 outBufImg, outBufVar, outBufMsk);

     return mathDetail::ConvolveGpuStatus::OK;
 }


 /**
  * Convolve a plain Image with a LinearCombinationKernel using the GPU.
  *
  * A spatially invariant kernel is handed to convolveSpatiallyInvariantGPU together with the
  * caller's complete ConvolutionControl. For a spatially varying kernel the kernel is refactored
  * when it has more kernel parameters than spatial parameters, a series of checks decides whether
  * the GPU can be used, and the image is copied into a GpuBuffer2D, convolved, and copied back.
  *
  * Only the region of @a convolvedImage that starts at the kernel centre and measures
  * (input width + 1 - kernel width) by (input height + 1 - kernel height) is written.
  *
  * @param[out] convolvedImage      destination image
  * @param[in]  inImage             image to convolve
  * @param[in]  kernel              kernel to convolve with
  * @param[in]  convolutionControl  supplies doNormalize and the device preference
  * @return ConvolveGpuStatus::OK on success; otherwise a code naming why the GPU was not used
  *         (NO_GPU, SFN_COUNT_ERROR, SFN_TYPE_ERROR, KERNEL_TOO_SMALL, KERNEL_TOO_BIG,
  *         KERNEL_COUNT_ERROR, INVALID_KERNEL_DATA, or whatever the delegate returns)
  * @throw afwGpu::GpuRuntimeError if afw was not compiled with GPU support
  */
 template <typename OutPixelT, typename InPixelT>
 mathDetail::ConvolveGpuStatus::ReturnCode mathDetail::convolveLinearCombinationGPU(
     afwImage::Image<OutPixelT>& convolvedImage,
     afwImage::Image<InPixelT > const& inImage,
     afwMath::LinearCombinationKernel const& kernel,
     afwMath::ConvolutionControl const & convolutionControl)
 {
     if (!afwGpu::isGpuBuild()) {
         throw LSST_EXCEPT(afwGpu::GpuRuntimeError, "Afw not compiled with GPU support");
     }
     typedef typename afwMath::Kernel::Pixel KernelPixel;
     typedef afwImage::Image<KernelPixel> KernelImage;
     typedef gpuDetail::GpuBuffer2D<KernelPixel> KernelBuffer;

     if (!kernel.isSpatiallyVarying()) {
         // use the standard algorithm for the spatially invariant case.
         // The whole control object is forwarded: passing only getDoNormalize() would lose the
         // caller's device preference, which the delegate reads.
         pexLog::TTrace<3>("lsst.afw.math.convolve",
                           "convolveLinearCombinationGPU: spatially invariant; delegate");
         return mathDetail::convolveSpatiallyInvariantGPU(convolvedImage, inImage, kernel,
                 convolutionControl);
     }

     // device selection is asked to suppress exceptions unless the caller insists on the GPU
     bool throwExceptionsOn = convolutionControl.getDevicePreference() == afwGpu::USE_GPU;
     if (afwGpu::detail::TryToSelectCudaDevice(!throwExceptionsOn) == false) {
         return mathDetail::ConvolveGpuStatus::NO_GPU;
     }

     // refactor the kernel if this is reasonable and possible;
     // then use the standard algorithm for the spatially varying case
     afwMath::Kernel::Ptr refKernelPtr; // possibly refactored version of kernel
     if (static_cast<int>(kernel.getNKernelParameters()) > kernel.getNSpatialParameters()) {
         // refactoring will speed convolution, so try it
         refKernelPtr = kernel.refactor();
         if (!refKernelPtr) {
             refKernelPtr = kernel.clone();
         }
     } else {
         // too few basis kernels for refactoring to be worthwhile
         refKernelPtr = kernel.clone();
     }

     assertDimensionsOK(convolvedImage, inImage, kernel);

     const afwMath::LinearCombinationKernel* newKernel =
         dynamic_cast<afwMath::LinearCombinationKernel*> (refKernelPtr.get());
     assert(newKernel!=NULL);

     const int kernelN = newKernel->getNBasisKernels();
     const std::vector< afwMath::Kernel::SpatialFunctionPtr > sFn = newKernel->getSpatialFunctionList();
     if (sFn.size() < 1) {
         return mathDetail::ConvolveGpuStatus::SFN_COUNT_ERROR;
     }
     if (int(sFn.size()) != kernelN) {
         return mathDetail::ConvolveGpuStatus::SFN_COUNT_ERROR;
     }

     // all spatial functions must be of one and the same supported type
     bool isAllCheby = true;
     bool isAllPoly = true;
     for (int i = 0; i < kernelN; i++) {
         if (! IS_INSTANCE( *sFn[i], afwMath::Chebyshev1Function2<double> ) ) {
             isAllCheby = false;
         }
         if (! IS_INSTANCE( *sFn[i], afwMath::PolynomialFunction2<double> ) ) {
             isAllPoly = false;
         }
     }

     // the order is read from the first spatial function only; the final else rejects mixed or
     // unsupported function types (a separate, identical check ahead of this chain was redundant)
     int order = 0;
 #ifdef GPU_BUILD
     SpatialFunctionType_t sfType;
 #endif
     if (isAllPoly) {
         order = dynamic_cast<const afwMath::PolynomialFunction2<double>*>( sFn[0].get() ) ->getOrder();
 #ifdef GPU_BUILD
         sfType = sftPolynomial;
 #endif
     } else if (isAllCheby) {
         order = dynamic_cast<const afwMath::Chebyshev1Function2<double>*>( sFn[0].get() ) ->getOrder();
 #ifdef GPU_BUILD
         sfType = sftChebyshev;
 #endif
     } else {
         return mathDetail::ConvolveGpuStatus::SFN_TYPE_ERROR;
     }

     // basis kernels; getKernelList() returns a const reference into newKernel, which refKernelPtr
     // keeps alive until the end of this function
     afwMath::KernelList const& kernelList = newKernel->getKernelList();

     //if kernel is too small, call CPU convolution
     // NOTE(review): the MaskedImage overload uses a threshold of 25 here - confirm the difference is intended.
     const int minKernelSize = 20;
     if (newKernel->getWidth() * newKernel->getHeight() < minKernelSize &&
             convolutionControl.getDevicePreference() != lsst::afw::gpu::USE_GPU) {
         return mathDetail::ConvolveGpuStatus::KERNEL_TOO_SMALL;
     }

     //if something is wrong, call CPU convolution
     const bool shMemOkA = IsSufficientSharedMemoryAvailable_ForImgBlock(
                               newKernel->getWidth(), newKernel->getHeight(), sizeof(double));
     const bool shMemOkB = IsSufficientSharedMemoryAvailable_ForSfn(order, kernelN);
     if (!shMemOkA || !shMemOkB) {
         //cannot fit kernels into shared memory, revert to convolution by CPU
         return mathDetail::ConvolveGpuStatus::KERNEL_TOO_BIG;
     }

     // NOTE(review): the MaskedImage overload also rejects kernelN > detail::gpu::maxGpuSfCount;
     // confirm whether the same limit applies to the plain-Image GPU routine.
     if (kernelN == 0) {
         return mathDetail::ConvolveGpuStatus::KERNEL_COUNT_ERROR;
     }

     // every basis kernel must share the dimensions and centre of the combined kernel
     for (int i = 0; i < kernelN; i++) {
         if (kernelList[i]->getDimensions() != newKernel->getDimensions()
                 || kernelList[i]->getCtr() != newKernel->getCtr()
            ) {
             return mathDetail::ConvolveGpuStatus::INVALID_KERNEL_DATA;
         }
     }

     // un-normalised image of each basis kernel
     std::vector< KernelBuffer >  basisKernels(kernelN);
     for (int i = 0; i < kernelN; i++) {
         KernelImage kernelImage(kernelList[i]->getDimensions());
         (void)kernelList[i]->computeImage(kernelImage, false);
         basisKernels[i].Init(kernelImage);
     }

     int const inImageWidth = inImage.getWidth();
     int const inImageHeight = inImage.getHeight();
     int const cnvWidth = inImageWidth + 1 - newKernel->getWidth();
     int const cnvHeight = inImageHeight + 1 - newKernel->getHeight();
     int const cnvStartX = newKernel->getCtrX();
     int const cnvStartY = newKernel->getCtrY();

     // image positions of the output columns and rows
     std::vector<double> colPos(cnvWidth);
     std::vector<double> rowPos(cnvHeight);

     for (int i = 0; i < cnvWidth; i++) {
         colPos[i] = inImage.indexToPosition(i + cnvStartX, afwImage::X);
     }
     for (int i = 0; i < cnvHeight; i++) {
         rowPos[i] = inImage.indexToPosition(i + cnvStartY, afwImage::Y);
     }

     gpuDetail::GpuBuffer2D<InPixelT>  inBuf(inImage);
     gpuDetail::GpuBuffer2D<OutPixelT> outBuf(cnvWidth, cnvHeight);

     pexLog::TTrace<3>("lsst.afw.math.convolve",
         "plain Image, convolveLinearCombinationGPU: will use GPU acceleration");

 #ifdef GPU_BUILD
     GPU_ConvolutionImage_LinearCombinationKernel<OutPixelT, InPixelT>(
         inBuf, colPos, rowPos,
         sFn,
         outBuf,
         basisKernels,
         sfType,
         convolutionControl.getDoNormalize()
     );
 #endif

     outBuf.CopyToImage(convolvedImage, cnvStartX, cnvStartY);

     return mathDetail::ConvolveGpuStatus::OK;
 }


 /**
  * Convolve a plain Image with a spatially invariant kernel using the GPU.
  *
  * The kernel image is computed (normalised if the control object asks for it) only after every
  * check that can send the caller back to the CPU has passed, so neither the kernel computation
  * nor the "using GPU acceleration" trace happens on a fallback. The MaskedImage overload uses
  * the same order.
  *
  * Only the region of @a convolvedImage that starts at the kernel centre and measures
  * (input width + 1 - kernel width) by (input height + 1 - kernel height) is written.
  *
  * @param[out] convolvedImage      destination image
  * @param[in]  inImage             image to convolve
  * @param[in]  kernel              kernel to convolve with; must not be spatially varying
  * @param[in]  convolutionControl  supplies doNormalize and the device preference
  * @return ConvolveGpuStatus::OK on success; otherwise NO_GPU, UNSUPPORTED_KERNEL,
  *         KERNEL_TOO_BIG or KERNEL_TOO_SMALL
  * @throw afwGpu::GpuRuntimeError if afw was not compiled with GPU support
  */
 template <typename OutPixelT, typename InPixelT>
 mathDetail::ConvolveGpuStatus::ReturnCode mathDetail::convolveSpatiallyInvariantGPU(
     afwImage::Image<OutPixelT>& convolvedImage,
     afwImage::Image<InPixelT > const& inImage,
     afwMath::Kernel const& kernel,
     afwMath::ConvolutionControl const & convolutionControl)
 {
     if (!afwGpu::isGpuBuild()) {
         throw LSST_EXCEPT(afwGpu::GpuRuntimeError, "Afw not compiled with GPU support");
     }
     const bool doNormalize = convolutionControl.getDoNormalize();

     // device selection is asked to suppress exceptions unless the caller insists on the GPU
     const bool throwExceptionsOn = convolutionControl.getDevicePreference() == afwGpu::USE_GPU;
     if (afwGpu::detail::TryToSelectCudaDevice(!throwExceptionsOn) == false) {
         return mathDetail::ConvolveGpuStatus::NO_GPU;
     }

     typedef typename afwMath::Kernel::Pixel KernelPixel;
     typedef afwImage::Image<KernelPixel> KernelImage;

     if (kernel.isSpatiallyVarying()) {
         return mathDetail::ConvolveGpuStatus::UNSUPPORTED_KERNEL;
     }

     assertDimensionsOK(convolvedImage, inImage, kernel);

     // NOTE(review): the MaskedImage overload uses a threshold of 20 - confirm the difference is intended.
     const int minKernelSize = 25;

     int const inImageWidth = inImage.getWidth();
     int const inImageHeight = inImage.getHeight();
     int const kWidth = kernel.getWidth();
     int const kHeight = kernel.getHeight();
     int const cnvWidth = inImageWidth + 1 - kWidth;
     int const cnvHeight = inImageHeight + 1 - kHeight;
     int const cnvStartX = kernel.getCtrX();
     int const cnvStartY = kernel.getCtrY();

     const bool shMemOk = IsSufficientSharedMemoryAvailable_ForImgBlock(kWidth, kHeight, sizeof(double));
     if (!shMemOk) {
         //cannot fit kernels into shared memory, revert to convolution by CPU
         return mathDetail::ConvolveGpuStatus::KERNEL_TOO_BIG;
     }
     //if kernel is too small, call CPU convolution
     if (kWidth * kHeight < minKernelSize &&
             convolutionControl.getDevicePreference() != lsst::afw::gpu::USE_GPU) {
         return mathDetail::ConvolveGpuStatus::KERNEL_TOO_SMALL;
     }

     // from here on the GPU is definitely used
     KernelImage kernelImage(kernel.getDimensions());

     pexLog::TTrace<3>("lsst.afw.math.convolve",
                       "convolveSpatiallyInvariantGPU: using GPU acceleration, "
                       "plain Image, kernel is spatially invariant");
     (void)kernel.computeImage(kernelImage, doNormalize);

     gpuDetail::GpuBuffer2D<InPixelT>  inBuf(inImage);
     gpuDetail::GpuBuffer2D<OutPixelT> outBuf(cnvWidth, cnvHeight);
     gpuDetail::GpuBuffer2D<KernelPixel> kernelBuf(kernelImage);

 #ifdef GPU_BUILD
     GPU_ConvolutionImage_SpatiallyInvariantKernel<OutPixelT, InPixelT>(inBuf, outBuf, kernelBuf);
 #endif
     outBuf.CopyToImage(convolvedImage, cnvStartX, cnvStartY);
     return mathDetail::ConvolveGpuStatus::OK;
 }


 /**
  * Convolve a MaskedImage with a spatially invariant kernel using the GPU.
  *
  * The image, variance and mask planes are copied into separate GpuBuffer2D objects, convolved
  * with the kernel image (normalised if the control object asks for it), and copied back.
  *
  * Only the region of @a convolvedImage that starts at the kernel centre and measures
  * (input width + 1 - kernel width) by (input height + 1 - kernel height) is written.
  *
  * @param[out] convolvedImage      destination masked image
  * @param[in]  inImage             masked image to convolve
  * @param[in]  kernel              kernel to convolve with; must not be spatially varying
  * @param[in]  convolutionControl  supplies doNormalize and the device preference
  * @return ConvolveGpuStatus::OK on success; otherwise UNSUPPORTED_KERNEL, NO_GPU,
  *         KERNEL_TOO_BIG or KERNEL_TOO_SMALL
  * @throw afwGpu::GpuRuntimeError if afw was not compiled with GPU support
  */
 template <typename OutPixelT, typename InPixelT>
 mathDetail::ConvolveGpuStatus::ReturnCode mathDetail::convolveSpatiallyInvariantGPU(
     afwImage::MaskedImage<OutPixelT, MskPixel, VarPixel>& convolvedImage,
     afwImage::MaskedImage<InPixelT , MskPixel, VarPixel> const& inImage,
     afwMath::Kernel const& kernel,
     afwMath::ConvolutionControl const & convolutionControl)
 {
     typedef typename afwMath::Kernel::Pixel KernelPixel;
     typedef afwImage::Image<KernelPixel> KernelImage;

     if (!afwGpu::isGpuBuild()) {
         throw LSST_EXCEPT(afwGpu::GpuRuntimeError, "Afw not compiled with GPU support");
     }

     bool const normalizeKernel = convolutionControl.getDoNormalize();

     if (kernel.isSpatiallyVarying()) {
         return mathDetail::ConvolveGpuStatus::UNSUPPORTED_KERNEL;
     }

     assertDimensionsOK(convolvedImage, inImage, kernel);

     // kernels with fewer pixels than this are left to the CPU unless the GPU is demanded
     int const smallKernelLimit = 20;

     int const kernelCols = kernel.getWidth();
     int const kernelRows = kernel.getHeight();
     int const outCols = inImage.getWidth() + 1 - kernelCols;
     int const outRows = inImage.getHeight() + 1 - kernelRows;
     int const outX0 = kernel.getCtrX();
     int const outY0 = kernel.getCtrY();

     // device selection is asked to suppress exceptions unless the caller insists on the GPU
     bool const gpuDemanded = convolutionControl.getDevicePreference() == afwGpu::USE_GPU;
     if (afwGpu::detail::TryToSelectCudaDevice(!gpuDemanded) == false) {
         return mathDetail::ConvolveGpuStatus::NO_GPU;
     }

     if (!IsSufficientSharedMemoryAvailable_ForImgAndMaskBlock(kernelCols, kernelRows, sizeof(double))) {
         // the kernel does not fit into shared memory; the caller falls back to the CPU
         return mathDetail::ConvolveGpuStatus::KERNEL_TOO_BIG;
     }

     bool const tooSmall = kernelCols * kernelRows < smallKernelLimit;
     if (tooSmall && convolutionControl.getDevicePreference() != lsst::afw::gpu::USE_GPU) {
         return mathDetail::ConvolveGpuStatus::KERNEL_TOO_SMALL;
     }

     KernelImage kernelImage(kernel.getDimensions());

     pexLog::TTrace<3>("lsst.afw.math.convolve",
                       "convolveSpatiallyInvariantGPU: using GPU acceleration, "
                       "MaskedImage, kernel is spatially invariant");
     (void)kernel.computeImage(kernelImage, normalizeKernel);

     // input planes
     gpuDetail::GpuBuffer2D<InPixelT>  srcImg;
     gpuDetail::GpuBuffer2D<VarPixel>  srcVar;
     gpuDetail::GpuBuffer2D<MskPixel>  srcMsk;
     CopyFromMaskedImage(inImage, srcImg, srcVar, srcMsk);

     // output planes
     gpuDetail::GpuBuffer2D<OutPixelT> dstImg(outCols, outRows);
     gpuDetail::GpuBuffer2D<VarPixel>  dstVar(outCols, outRows);
     gpuDetail::GpuBuffer2D<MskPixel>  dstMsk(outCols, outRows);

     gpuDetail::GpuBuffer2D<KernelPixel> kernelBuf(kernelImage);
 #ifdef GPU_BUILD
     GPU_ConvolutionMI_SpatiallyInvariantKernel<OutPixelT, InPixelT>(
         srcImg, srcVar, srcMsk,
         dstImg, dstVar, dstMsk,
         kernelBuf
     );
 #endif
     CopyToImage(convolvedImage, outX0, outY0,
                 dstImg, dstVar, dstMsk);
     return mathDetail::ConvolveGpuStatus::OK;
 }


 /*

  * Explicit instantiation

  *

  * The function templates above are defined in this file, so each supported combination of

  * output and input pixel type is instantiated explicitly below, for Image and for MaskedImage.

  */

 // IMAGE(PIXTYPE): the plain Image type with the given pixel type

 #define IMAGE(PIXTYPE) afwImage::Image<PIXTYPE>

 // MASKEDIMAGE(PIXTYPE): the MaskedImage type with the given image pixel type

 #define MASKEDIMAGE(PIXTYPE) afwImage::MaskedImage<PIXTYPE, afwImage::MaskPixel, afwImage::VariancePixel>

 // NL expands to nothing (only a comment); it marks statement boundaries inside the macro below

 #define NL /* */

 // Instantiate Image or MaskedImage versions

 // of basicConvolveGPU, convolveLinearCombinationGPU and convolveSpatiallyInvariantGPU;

 // IMGMACRO is IMAGE or MASKEDIMAGE

 #define INSTANTIATE_IM_OR_MI(IMGMACRO, OUTPIXTYPE, INPIXTYPE) \

     template mathDetail::ConvolveGpuStatus::ReturnCode mathDetail::basicConvolveGPU( \

         IMGMACRO(OUTPIXTYPE)&, IMGMACRO(INPIXTYPE) const&, afwMath::Kernel const&, \

             afwMath::ConvolutionControl const&); NL \

     template mathDetail::ConvolveGpuStatus::ReturnCode mathDetail::convolveLinearCombinationGPU( \

         IMGMACRO(OUTPIXTYPE)&, IMGMACRO(INPIXTYPE) const&, afwMath::LinearCombinationKernel const&, \

             afwMath::ConvolutionControl const&); NL \

     template mathDetail::ConvolveGpuStatus::ReturnCode mathDetail::convolveSpatiallyInvariantGPU( \

         IMGMACRO(OUTPIXTYPE)&, IMGMACRO(INPIXTYPE) const&, afwMath::Kernel const&, \

             afwMath::ConvolutionControl const&);

 // Instantiate both Image and MaskedImage versions

 #define INSTANTIATE(OUTPIXTYPE, INPIXTYPE) \

     INSTANTIATE_IM_OR_MI(IMAGE,       OUTPIXTYPE, INPIXTYPE) \

     INSTANTIATE_IM_OR_MI(MASKEDIMAGE, OUTPIXTYPE, INPIXTYPE)


 // Supported (output pixel type, input pixel type) pairs

 INSTANTIATE(double, double)

 INSTANTIATE(double, float)

 INSTANTIATE(double, int)

 INSTANTIATE(double, boost::uint16_t)

 INSTANTIATE(float, float)

 INSTANTIATE(float, int)

 INSTANTIATE(float, boost::uint16_t)

 INSTANTIATE(int, int)

 INSTANTIATE(boost::uint16_t, boost::uint16_t)


lsst.afw.math::Chebyshev1Function2
2-dimensional weighted sum of Chebyshev polynomials of the first kind.
Definition: FunctionLibrary.h:823

Convolve.h
Convolution support.

lsst.afw.image::MaskedImage::getWidth
int getWidth() const
Return the number of columns in the image.
Definition: MaskedImage.h:901

geom.h
An include file to include the header files for lsst::afw::geom.

Kernel.h
Declare the Kernel class and subclasses.

lsst.afw.gpu::detail::GpuBuffer2D
Class for representing an image (or a 2D array in general)
Definition: GpuBuffer2D.h:54

lsst.afw.math.detail::convolveLinearCombinationGPU
ConvolveGpuStatus::ReturnCode convolveLinearCombinationGPU(lsst::afw::image::MaskedImage< OutPixelT, lsst::afw::image::MaskPixel, lsst::afw::image::VariancePixel > &convolvedImage, lsst::afw::image::MaskedImage< InPixelT, lsst::afw::image::MaskPixel, lsst::afw::image::VariancePixel > const &inImage, lsst::afw::math::LinearCombinationKernel const &kernel, lsst::afw::math::ConvolutionControl const &convolutionControl)

lsst.afw.math::Kernel::getWidth
int getWidth() const
Return the Kernel's width.
Definition: Kernel.h:240

lsst.afw.math.detail::IsSufficientSharedMemoryAvailable_ForImgBlock
bool IsSufficientSharedMemoryAvailable_ForImgBlock(int filterW, int filterH, int pixSize)
Definition: cudaConvWrapper.cc:51

lsst.afw.math::LinearCombinationKernel::getKernelList
virtual KernelList const & getKernelList() const
Get the fixed basis kernels.
Definition: LinearCombinationKernel.cc:144

lsst::ip::diffim::detail::PixelT
float PixelT
Definition: AssessSpatialKernelVisitor.cc:209

lsst.afw.image::ImageBase::indexToPosition
double indexToPosition(double ind, lsst::afw::image::xOrY const xy) const
Convert image index to image position.
Definition: Image.h:290

lsst.afw.math::Kernel::Pixel
double Pixel
Definition: Kernel.h:140

lsst.afw.math::Kernel::getHeight
int getHeight() const
Return the Kernel's height.
Definition: Kernel.h:247

lsst.afw.math::LinearCombinationKernel::clone
virtual boost::shared_ptr< Kernel > clone() const
Return a pointer to a deep copy of this kernel.
Definition: LinearCombinationKernel.cc:109

GpuExceptions.h
additional GPU exceptions

lsst.afw.gpu::USE_GPU
Definition: DevicePreference.h:61

lsst.afw.math::ConvolutionControl
Parameters to control convolution.
Definition: ConvolveImage.h:58

lsst.afw.image::MaskedImage::indexToPosition
double indexToPosition(double ind, lsst::afw::image::xOrY const xy) const
Convert image index to image position (see Image::indexToPosition)
Definition: MaskedImage.h:971

lsst.afw.math::PolynomialFunction2
2-dimensional polynomial function with cross terms
Definition: FunctionLibrary.h:524

Trace.h
definition of the Trace messaging facilities

lsst.afw.math::ConvolutionControl::getDoNormalize
bool getDoNormalize() const
Definition: ConvolveImage.h:76

lsst.afw.math::SeparableKernel
A kernel described by a pair of functions: func(x, y) = colFunc(x) * rowFunc(y)
Definition: Kernel.h:986

lsst.afw.math::Kernel::getSpatialFunctionList
std::vector< SpatialFunctionPtr > getSpatialFunctionList() const
Return a list of clones of the spatial functions.
Definition: Kernel.cc:186

lsst.afw.math::Kernel::getNKernelParameters
unsigned int getNKernelParameters() const
Return the number of kernel parameters (0 if none)
Definition: Kernel.h:289

FunctionLibrary.h
Define a collection of useful Functions.

lsst.afw.math::Kernel::getNSpatialParameters
int getNSpatialParameters() const
Return the number of spatial parameters (0 if not spatially varying)
Definition: Kernel.h:296

lsst.afw.gpu::detail::TryToSelectCudaDevice
bool TryToSelectCudaDevice(bool noExceptions, bool reselect=false)
Definition: CudaSelectGpu.cc:74

lsst.afw.math.detail::ConvolveGpuStatus::NO_GPU
Definition: ConvolveGPU.h:57

lsst.afw.math.detail::ConvolveGpuStatus::INVALID_KERNEL_DATA
Definition: ConvolveGPU.h:58

lsst.afw.math::Kernel::getCtr
lsst::afw::geom::Point2I getCtr() const
Return index of kernel's center.
Definition: Kernel.h:254

lsst.afw.math.detail::SpatialFunctionType_t
SpatialFunctionType_t
Definition: cudaConvWrapper.h:45

ConvCpuGpuShared.h
CPU and GPU convolution shared code.

lsst.afw.math.detail::convolveSpatiallyInvariantGPU
ConvolveGpuStatus::ReturnCode convolveSpatiallyInvariantGPU(lsst::afw::image::MaskedImage< OutPixelT, lsst::afw::image::MaskPixel, lsst::afw::image::VariancePixel > &convolvedImage, lsst::afw::image::MaskedImage< InPixelT, lsst::afw::image::MaskPixel, lsst::afw::image::VariancePixel > const &inImage, lsst::afw::math::Kernel const &kernel, lsst::afw::math::ConvolutionControl const &convolutionControl)

GpuBuffer2D.h
contains GpuBuffer2D class (for simple handling of images or 2D arrays)

lsst.afw.math.detail::VarPixel
lsst::afw::image::VariancePixel VarPixel
Definition: convCUDA.h:44

lsst.afw.math::Kernel::isSpatiallyVarying
bool isSpatiallyVarying() const
Return true iff the kernel is spatially varying (has a spatial function)
Definition: Kernel.h:402

lsst.afw.math::Kernel::Ptr
boost::shared_ptr< Kernel > Ptr
Definition: Kernel.h:141

lsst.afw.gpu::detail::GpuBuffer2D::GetImgLinePtr
PixelT * GetImgLinePtr(int y)
Definition: GpuBuffer2D.h:123

image
table::Key< table::Array< Kernel::Pixel > > image
Definition: FixedKernel.cc:117

lsst.afw.math.detail::ConvolveGpuStatus::SFN_TYPE_ERROR
Definition: ConvolveGPU.h:59

lsst.afw.image::MaskedImage::x_at
x_iterator x_at(int x, int y) const
Return an x_iterator at the point (x, y)
Definition: MaskedImage.h:1006

lsst.afw.image::ImageBase::getWidth
int getWidth() const
Return the number of columns in the image.
Definition: Image.h:237

convCUDA.h
GPU convolution code.

cudaConvWrapper.h
Set up for convolution, calls GPU convolution kernels.

lsst.afw.math.detail::ConvolveGpuStatus::ReturnCode
ReturnCode
Definition: ConvolveGPU.h:57

lsst.afw.gpu::detail::GpuBuffer2D::height
int height
Definition: GpuBuffer2D.h:61

lsst.afw.math.detail::ConvolveGpuStatus::KERNEL_COUNT_ERROR
Definition: ConvolveGPU.h:58

lsst.afw.math::LinearCombinationKernel
A kernel that is a linear combination of fixed basis kernels.
Definition: Kernel.h:814

lsst.afw.image::MaskedImage
A class to manipulate images, masks, and variance as a single object.
Definition: MaskedImage.h:77

lsst.afw.math::Kernel::getDimensions
geom::Extent2I const getDimensions() const
Return the Kernel's dimensions (width, height)
Definition: Kernel.h:226

lsst.afw.gpu::isGpuBuild
bool isGpuBuild()
Inline function which returns true only when GPU_BUILD macro is defined.
Definition: IsGpuBuild.h:45

lsst.afw.math.detail::IsSufficientSharedMemoryAvailable_ForSfn
bool IsSufficientSharedMemoryAvailable_ForSfn(int order, int kernelN)
Definition: cudaConvWrapper.cc:59

lsst.afw.math.detail::ConvolveGpuStatus::UNSUPPORTED_KERNEL
Definition: ConvolveGPU.h:58

lsst.afw.math.detail::gpu::maxGpuSfCount
const int maxGpuSfCount
Definition: convCUDA.h:50

lsst.afw.math::LinearCombinationKernel::refactor
boost::shared_ptr< Kernel > refactor() const
Refactor the kernel as a linear combination of N bases where N is the number of parameters for the spatial model.
Definition: LinearCombinationKernel.cc:156

lsst.afw.image::X
Definition: ImageUtils.h:40

lsst.afw.math.detail::ConvolveGpuStatus::KERNEL_TOO_BIG
Definition: ConvolveGPU.h:57

lsst.afw.math.detail::ConvolveGpuStatus::SFN_COUNT_ERROR
Definition: ConvolveGPU.h:59

lsst.afw.math::LinearCombinationKernel::getNBasisKernels
int getNBasisKernels() const
Get the number of basis kernels.
Definition: Kernel.h:875

CudaSelectGpu.h
Functions to help managing setup for GPU kernels.

ConvolveImage.h
Convolve and convolveAtAPoint functions for Image and Kernel.

lsst.afw.math.detail::KerPixel
double KerPixel
Definition: convCUDA.h:46

LSST_EXCEPT
#define LSST_EXCEPT(type,...)
Definition: Exception.h:46

INSTANTIATE
#define INSTANTIATE(T)
Definition: ApertureFlux.cc:282

lsst.afw.math.detail::assertDimensionsOK
void assertDimensionsOK(OutImageT const &convolvedImage, InImageT const &inImage, lsst::afw::math::Kernel const &kernel)
Definition: convCpuGpuShared.cc:59

lsst.afw.math.detail::ConvolveGpuStatus::OK
Definition: ConvolveGPU.h:57

lsst.afw.math.detail::ConvolveGpuStatus::KERNEL_TOO_SMALL
Definition: ConvolveGPU.h:57

lsst.afw.gpu::detail::GpuBuffer2D::Init
void Init(const ImageT &image)
Definition: GpuBuffer2D.h:71

lsst.afw.math.detail::sftChebyshev
Definition: cudaConvWrapper.h:45

MaskedImage.h
Implementation of the Class MaskedImage.

lsst.afw.image::ImageBase::getHeight
int getHeight() const
Return the number of rows in the image.
Definition: Image.h:239

lsst.afw.gpu::detail::GpuBuffer2D::width
int width
Definition: GpuBuffer2D.h:60

lsst.afw.math::KernelList
std::vector< boost::shared_ptr< Kernel > > KernelList
Definition: Kernel.h:542

lsst.afw.math.detail::sftPolynomial
Definition: cudaConvWrapper.h:45

lsst.afw.math::Kernel::getCtrY
int getCtrY() const
Return y index of kernel's center.
Definition: Kernel.h:272

lsst.afw.image::MaskedImage::getHeight
int getHeight() const
Return the number of rows in the image.
Definition: MaskedImage.h:903

ConvolveGPU.h
Convolution support.

lsst.afw.image::Y
Definition: ImageUtils.h:40

lsst.afw.math::Kernel
Kernels are used for convolution with MaskedImages and (eventually) Images.
Definition: Kernel.h:134

lsst.afw.image::Image
A class to represent a 2-dimensional array of pixels.
Definition: PSF.h:43

IS_INSTANCE
#define IS_INSTANCE(A, B)
Definition: Convolve.h:48

lsst.afw.math::DeltaFunctionKernel
A kernel that has only one non-zero pixel (of value 1)
Definition: Kernel.h:744

lsst.afw.math.detail::MskPixel
lsst::afw::image::MaskPixel MskPixel
Definition: convCUDA.h:45

lsst.afw.math.detail::IsSufficientSharedMemoryAvailable_ForImgAndMaskBlock
bool IsSufficientSharedMemoryAvailable_ForImgAndMaskBlock(int filterW, int filterH, int pixSize)
Definition: cudaConvWrapper.cc:55

lsst.afw.math::ConvolutionControl::getDevicePreference
lsst::afw::gpu::DevicePreference getDevicePreference() const
Definition: ConvolveImage.h:79

lsst.afw.math::Kernel::computeImage
double computeImage(lsst::afw::image::Image< Pixel > &image, bool doNormalize, double x=0.0, double y=0.0) const
Compute an image (pixellized representation of the kernel) in place.
Definition: Kernel.cc:94

exceptions.h
Include files required for standard LSST Exception handling.

IsGpuBuild.h
A function to determine whether compiling for GPU is enabled.

lsst.afw.math.detail::basicConvolveGPU
ConvolveGpuStatus::ReturnCode basicConvolveGPU(OutImageT &convolvedImage, InImageT const &inImage, lsst::afw::math::Kernel const &kernel, lsst::afw::math::ConvolutionControl const &convolutionControl)

lsst.afw.math::Kernel::getCtrX
int getCtrX() const
Return x index of kernel's center.
Definition: Kernel.h:263