initial commit

This commit is contained in:
Ondrej Jamriska
2018-04-16 04:23:07 +02:00
parent 31f7827d7d
commit 784ca88cc3
45 changed files with 12034 additions and 2 deletions

20
src/cudacheck.h Normal file
View File

@@ -0,0 +1,20 @@
#ifndef CUDACHECK_H_
#define CUDACHECK_H_
#include <cstdio>

// Reports a failed CUDA call (any non-zero result code).
//
// Parameters:
//   result - status value returned by the CUDA call (non-zero == error)
//   func   - stringified source expression of the call (from the macro)
//   file   - source file of the call site
//   line   - source line of the call site
// Returns true when an error occurred, false otherwise, so callers can
// react if they wish; the error is printed either way.
// Use via the checkCudaError() macro below, which fills in func/file/line.
template<typename T>
bool checkCudaError_(T result,char const* const func,const char* const file,int const line)
{
  if (result)
  {
    // %u matches the unsigned cast of the error code (the original %d
    // relied on signed/unsigned mismatch working out).
    printf("CUDA error at %s:%d code=%u \"%s\"\n",file,line,static_cast<unsigned int>(result),func);
    return true;
  }
  return false;
}
#define checkCudaError(val) checkCudaError_((val),#val,__FILE__,__LINE__)
#endif

1270
src/ebsynth.cu Normal file

File diff suppressed because it is too large Load Diff

1901
src/jzq.h Normal file

File diff suppressed because it is too large Load Diff

73
src/memarray2.h Normal file
View File

@@ -0,0 +1,73 @@
// This software is in the public domain. Where that dedication is not
// recognized, you are granted a perpetual, irrevocable license to copy
// and modify this file as you see fit.
#ifndef MEMARRAY2_H_
#define MEMARRAY2_H_
#include "jzq.h"
//#include "cudacheck.h"
// Owning wrapper around a linear, row-major 2-D buffer in GPU global
// memory; element (i,j) lives at data[i+j*width].
//
// The struct is intentionally shallow-copyable (it is passed to kernels
// by value), so it has no destructor; call destroy() exactly once on the
// owning copy.
template<typename T>
struct MemArray2
{
  T* data;
  int width;
  int height;

  // Members listed in declaration order (data, width, height) so the
  // mem-initializer list matches actual initialization order (-Wreorder).
  MemArray2() : data(0),width(0),height(0) {};

  // Allocates an uninitialized width x height device buffer.
  MemArray2(const V2i& size)
  {
    width = size(0);
    height = size(1);
    checkCudaError(cudaMalloc(&data,width*height*sizeof(T)));
  }

  MemArray2(int _width,int _height)
  {
    width = _width;
    height = _height;
    checkCudaError(cudaMalloc(&data,width*height*sizeof(T)));
  }

  // Frees the device buffer.  Safe on a default-constructed instance
  // (cudaFree(0) is a no-op); data is nulled to guard against double-free.
  void destroy()
  {
    checkCudaError( cudaFree(data) );
    data = 0;
  }
};
// Uploads a host-side Array2 into an equally-sized device MemArray2.
template<typename T>
void copy(MemArray2<T>* out_dst,const Array2<T>& src)
{
  assert(out_dst != 0);
  MemArray2<T>& dst = *out_dst;
  assert(src.width() == dst.width);
  assert(src.height() == dst.height);
  const size_t numBytes = src.width()*src.height()*sizeof(T);
  checkCudaError(cudaMemcpy(dst.data,src.data(),numBytes,cudaMemcpyHostToDevice));
}
// Downloads a device MemArray2 into an equally-sized host Array2.
template<typename T>
void copy(Array2<T>* out_dst,const MemArray2<T>& src)
{
  assert(out_dst != 0);
  // Non-const reference: the destination is written, matching the
  // host->device overload above (the original took a const reference
  // and then cast away constness).
  Array2<T>& dst = *out_dst;
  assert(dst.width() == src.width);
  assert(dst.height() == src.height);
  checkCudaError(cudaMemcpy((void*)dst.data(),src.data, src.width*src.height*sizeof(T), cudaMemcpyDeviceToHost));
}
#endif

342
src/patchmatch_gpu.h Normal file
View File

@@ -0,0 +1,342 @@
// This software is in the public domain. Where that dedication is not
// recognized, you are granted a perpetual, irrevocable license to copy
// and modify this file as you see fit.
#ifndef PATCHMATCH_GPU_H_
#define PATCHMATCH_GPU_H_
#include <cfloat>
#include <curand.h>
#include <curand_kernel.h>
#include "texarray2.h"
#include "memarray2.h"
typedef Vec<1,float> V1f;
typedef Array2<Vec<1,float>> A2V1f;
// Seeds one curand state per pixel of a width x height grid; each thread
// derives a distinct seed from its flat pixel index.
__global__ void krnlInitRngStates(const int width,
                                  const int height,
                                  curandState* rngStates)
{
  const int x = blockIdx.x*blockDim.x + threadIdx.x;
  const int y = blockIdx.y*blockDim.y + threadIdx.y;
  if (x>=width || y>=height) { return; }
  const int pixelIndex = y*width + x;
  curand_init((1337 << 20) + pixelIndex, 0, 0, &rngStates[pixelIndex]);
}
// Allocates and seeds a width x height array of curand states on the
// device.  The caller owns the returned buffer (release with cudaFree).
curandState* initGpuRng(const int width,
                        const int height)
{
  curandState* gpuRngStates;
  // Check the allocation: an unreported failure here would surface much
  // later as an illegal-address error inside a kernel.
  checkCudaError(cudaMalloc(&gpuRngStates,width*height*sizeof(curandState)));
  const dim3 threadsPerBlock(16,16);
  const dim3 numBlocks((width+threadsPerBlock.x)/threadsPerBlock.x,
                       (height+threadsPerBlock.y)/threadsPerBlock.y);
  krnlInitRngStates<<<numBlocks,threadsPerBlock>>>(width,height,gpuRngStates);
  // Kernel launches do not return errors directly; pick up launch-config
  // failures here.
  checkCudaError(cudaGetLastError());
  return gpuRngStates;
}
// Patch-dissimilarity functor: per-channel weighted sum of squared
// differences between the patchWidth x patchWidth patch centered at
// (ax,ay) in A and the one centered at (bx,by) in B.
template<int N,typename T,int M>
struct PatchSSD
{
const TexArray2<N,T,M> A;
const TexArray2<N,T,M> B;
const Vec<N,float> weights;  // per-channel weight applied to each squared difference
PatchSSD(const TexArray2<N,T,M>& A,
const TexArray2<N,T,M>& B,
const Vec<N,float>& weights)
: A(A),B(B),weights(weights) {}
// Returns the weighted SSD.  ebest is an early-out budget: once the
// running sum exceeds it (checked after each patch row) a partial sum
// larger than ebest is returned, so the result is only exact when it
// is <= ebest.
__device__ float operator()(int patchWidth,
const int ax,
const int ay,
const int bx,
const int by,
const float ebest)
{
const int hpw = patchWidth/2;
float ssd = 0;
for(int py=-hpw;py<=+hpw;py++)
{
for(int px=-hpw;px<=+hpw;px++)
{
const Vec<N,T> pixelA = A(ax + px, ay + py);
const Vec<N,T> pixelB = B(bx + px, by + py);
for(int i=0;i<N;i++)
{
const float diff = float(pixelA[i])-float(pixelB[i]);
ssd += weights[i]*diff*diff;
}
}
// early out: a candidate already worse than ebest can never win
if (ssd>ebest) { return ssd; }
}
return ssd;
}
};
// Recomputes the per-pixel patch error E for the current NNF.
// One thread per NNF pixel; FLT_MAX disables patchError's early out so
// the stored error is exact.
template<typename FUNC>
__global__ void krnlEvalErrorPass(const int patchWidth,
                                  FUNC patchError,
                                  const TexArray2<2,int> NNF,
                                  TexArray2<1,float> E)
{
  const int x = blockIdx.x*blockDim.x + threadIdx.x;
  const int y = blockIdx.y*blockDim.y + threadIdx.y;
  if (x>=NNF.width || y>=NNF.height) { return; }
  const V2i match = NNF(x,y);
  const float err = patchError(patchWidth,x,y,match[0],match[1],FLT_MAX);
  E.write(x,y,V1f(err));
}
// Adds incdec to the occupancy counter of every pixel covered by the
// patch centered at (bx,by).  Atomic: many threads may retarget
// overlapping patches concurrently.
void __device__ updateOmega(MemArray2<int>& Omega,const int patchWidth,const int bx,const int by,const int incdec)
{
  const int halfWidth = patchWidth/2;
  for(int oy=-halfWidth;oy<=+halfWidth;oy++)
  {
    for(int ox=-halfWidth;ox<=+halfWidth;ox++)
    {
      const int idx = (bx+ox)+(by+oy)*Omega.width;
      atomicAdd(&Omega.data[idx],incdec);
    }
  }
}
// Sums the occupancy counters over the patch centered at (bx,by).
// Uses plain (non-atomic) loads, so the result may be slightly stale
// while other threads are updating Omega concurrently.
int __device__ patchOmega(const int patchWidth,const int bx,const int by,const MemArray2<int>& Omega)
{
  const int halfWidth = patchWidth/2;
  int total = 0;
  for(int oy=-halfWidth;oy<=+halfWidth;oy++)
  {
    for(int ox=-halfWidth;ox<=+halfWidth;ox++)
    {
      total += Omega.data[(bx+ox)+(by+oy)*Omega.width];
    }
  }
  return total;
}
// Evaluates candidate match (bx,by) for source pixel (ax,ay) against the
// current best (nbest,ebest) using an occupancy-penalized error
// (err + lambda*occ).  On acceptance it updates the occupancy map Omega
// (increment at the new target, decrement at the old one) and overwrites
// nbest/ebest in place.
template<typename FUNC>
__device__ void tryPatch(const V2i& sizeA,
const V2i& sizeB,
MemArray2<int>& Omega,
const int patchWidth,
FUNC patchError,
const float lambda,
const int ax,
const int ay,
const int bx,
const int by,
V2i& nbest,
float& ebest)
{
// expected per-patch occupancy if source patches were spread uniformly
// over B (used to normalize the occupancy terms below)
const float omegaBest = (float(sizeA(0)*sizeA(1)) /
float(sizeB(0)*sizeB(1))) * float(patchWidth*patchWidth);
const float curOcc = (float(patchOmega(patchWidth,nbest(0),nbest(1),Omega))/float(patchWidth*patchWidth))/omegaBest;
const float newOcc = (float(patchOmega(patchWidth, bx, by,Omega))/float(patchWidth*patchWidth))/omegaBest;
const float curErr = ebest;
// curErr+lambda*curOcc doubles as patchError's early-out budget: the
// candidate cannot win once its raw error alone exceeds that total
const float newErr = patchError(patchWidth,ax,ay,bx,by,curErr+lambda*curOcc);
if ((newErr+lambda*newOcc) < (curErr+lambda*curOcc))
{
updateOmega(Omega,patchWidth, bx, by,+1);
updateOmega(Omega,patchWidth,nbest(0),nbest(1),-1);
nbest = V2i(bx,by);
ebest = newErr;
}
}
// Propagation step: borrows the NNF entry of the neighbor at offset
// (ox,oy), shifts it back by the same offset, and proposes the result as
// a candidate match for pixel (x,y).  Candidates whose patch would stick
// out of B are rejected.
template<typename FUNC>
__device__ void tryNeighborsOffset(const int x,
                                   const int y,
                                   const int ox,
                                   const int oy,
                                   V2i& nbest,
                                   float& ebest,
                                   const V2i& sizeA,
                                   const V2i& sizeB,
                                   MemArray2<int>& Omega,
                                   const int patchWidth,
                                   FUNC patchError,
                                   const float lambda,
                                   const TexArray2<2,int>& NNF)
{
  const int hpw = patchWidth/2;
  const V2i neighborMatch = NNF(x+ox,y+oy);
  const int candidateX = neighborMatch(0)-ox;
  const int candidateY = neighborMatch(1)-oy;
  const bool patchInsideB = candidateX>=hpw && candidateX<sizeB(0)-hpw &&
                            candidateY>=hpw && candidateY<sizeB(1)-hpw;
  if (patchInsideB)
  {
    tryPatch(sizeA,sizeB,Omega,patchWidth,patchError,lambda,x,y,candidateX,candidateY,nbest,ebest);
  }
}
// One propagation pass of PatchMatch: each pixel of A tries to adopt the
// (offset-corrected) matches of its four axis-aligned neighbors at
// distance r.  Pixels where mask != 255 are frozen.  Results are written
// to the double buffer NNF2 (and E updated in place); the host swaps
// NNF/NNF2 after the launch.
template<typename FUNC>
__global__ void krnlPropagationPass(const V2i sizeA,
const V2i sizeB,
MemArray2<int> Omega,
const int patchWidth,
FUNC patchError,
const float lambda,
const int r,
const TexArray2<2,int> NNF,
TexArray2<2,int> NNF2,
TexArray2<1,float> E,
TexArray2<1,unsigned char> mask)
{
const int x = blockDim.x*blockIdx.x + threadIdx.x;
const int y = blockDim.y*blockIdx.y + threadIdx.y;
if (x<sizeA(0) && y<sizeA(1))
{
V2i nbest = NNF(x,y);
float ebest = E(x,y)(0);
if (mask(x,y)[0]==255)
{
tryNeighborsOffset(x,y,-r,0,nbest,ebest,sizeA,sizeB,Omega,patchWidth,patchError,lambda,NNF);
tryNeighborsOffset(x,y,+r,0,nbest,ebest,sizeA,sizeB,Omega,patchWidth,patchError,lambda,NNF);
tryNeighborsOffset(x,y,0,-r,nbest,ebest,sizeA,sizeB,Omega,patchWidth,patchError,lambda,NNF);
tryNeighborsOffset(x,y,0,+r,nbest,ebest,sizeA,sizeB,Omega,patchWidth,patchError,lambda,NNF);
}
// written unconditionally so masked pixels are carried over into NNF2
E.write(x,y,V1f(ebest));
NNF2.write(x,y,nbest);
}
}
// Random-search step: samples one candidate uniformly from the window of
// radius r around norg (clamped so the whole patch stays inside B) and
// proposes it as a match for pixel (x,y).
template<typename FUNC>
__device__ void tryRandomOffsetInRadius(const int r,
                                        const V2i& sizeA,
                                        const V2i& sizeB,
                                        MemArray2<int>& Omega,
                                        const int patchWidth,
                                        FUNC patchError,
                                        const float lambda,
                                        const int x,
                                        const int y,
                                        const V2i& norg,
                                        V2i& nbest,
                                        float& ebest,
                                        curandState* rngState)
{
  const int hpw = patchWidth/2;
  const int xmin = max(norg(0)-r,hpw);
  const int xmax = min(norg(0)+r,sizeB(0)-1-hpw);
  const int ymin = max(norg(1)-r,hpw);
  const int ymax = min(norg(1)+r,sizeB(1)-1-hpw);
  // guard: when B is smaller than the patch the clamped window is empty
  // and the modulo below would divide by a non-positive value (UB)
  if (xmax<xmin || ymax<ymin) { return; }
  const int nx = xmin+(curand(rngState)%(xmax-xmin+1));
  const int ny = ymin+(curand(rngState)%(ymax-ymin+1));
  tryPatch(sizeA,sizeB,Omega,patchWidth,patchError,lambda,x,y,nx,ny,nbest,ebest);
}
// One random-search pass of PatchMatch: for each unmasked pixel, samples
// candidates at exponentially growing radii (1,2,4,...) around the
// pixel's current match and keeps improvements.  Updates NNF and E in
// place (no double buffer); pixels where mask != 255 are left untouched.
template<typename FUNC>
__global__ void krnlRandomSearchPass(const V2i sizeA,
const V2i sizeB,
MemArray2<int> Omega,
const int patchWidth,
FUNC patchError,
const float lambda,
TexArray2<2,int> NNF,
TexArray2<1,float> E,
TexArray2<1,unsigned char> mask,
curandState* rngStates)
{
const int x = blockDim.x*blockIdx.x + threadIdx.x;
const int y = blockDim.y*blockIdx.y + threadIdx.y;
if (x<sizeA(0) && y<sizeA(1))
{
if (mask(x,y)[0]==255)
{
V2i nbest = NNF(x,y);
float ebest = E(x,y)(0);
// search windows are centered on the match held at pass entry,
// not on the improving nbest
const V2i norg = nbest;
for(int r=1;r<max(sizeB(0),sizeB(1))/2;r=r*2)
{
// rngStates was allocated with one state per NNF pixel (see initGpuRng)
tryRandomOffsetInRadius(r,sizeA,sizeB,Omega,patchWidth,patchError,lambda,x,y,norg,nbest,ebest,&rngStates[x+y*NNF.width]);
}
E.write(x,y,V1f(ebest));
NNF.write(x,y,nbest);
}
}
}
// Runs uniformity-constrained PatchMatch on the GPU: an initial error
// evaluation, then numIters iterations of three propagation passes
// (neighbor distances 4, 2, 1 — jump-flood style) plus one random-search
// pass, then a final exact error evaluation.
//
// NNF/NNF2 are a double buffer: each propagation pass writes into NNF2
// and the host swaps the wrappers afterwards, so on return NNF holds the
// final field.  Omega is the shared occupancy map, mask gates which
// pixels may update, rngStates must come from initGpuRng sized like NNF.
// The cudaDeviceSynchronize after every launch both orders the swap and
// surfaces kernel execution errors through checkCudaError.
template<typename FUNC>
void patchmatchGPU(const V2i sizeA,
const V2i sizeB,
MemArray2<int>& Omega,
const int patchWidth,
FUNC patchError,
const float lambda,
const int numIters,
const int numThreadsPerBlock,
TexArray2<2,int>& NNF,
TexArray2<2,int>& NNF2,
TexArray2<1,float>& E,
TexArray2<1,unsigned char>& mask,
curandState* rngStates)
{
const dim3 threadsPerBlock = dim3(numThreadsPerBlock,numThreadsPerBlock);
const dim3 numBlocks = dim3((NNF.width+threadsPerBlock.x)/threadsPerBlock.x,
(NNF.height+threadsPerBlock.y)/threadsPerBlock.y);
krnlEvalErrorPass<<<numBlocks,threadsPerBlock>>>(patchWidth,patchError,NNF,E);
checkCudaError(cudaDeviceSynchronize());
for(int i=0;i<numIters;i++)
{
krnlPropagationPass<<<numBlocks,threadsPerBlock>>>(sizeA,sizeB,Omega,patchWidth,patchError,lambda,4,NNF,NNF2,E,mask); std::swap(NNF,NNF2);
checkCudaError(cudaDeviceSynchronize());
krnlPropagationPass<<<numBlocks,threadsPerBlock>>>(sizeA,sizeB,Omega,patchWidth,patchError,lambda,2,NNF,NNF2,E,mask); std::swap(NNF,NNF2);
checkCudaError(cudaDeviceSynchronize());
krnlPropagationPass<<<numBlocks,threadsPerBlock>>>(sizeA,sizeB,Omega,patchWidth,patchError,lambda,1,NNF,NNF2,E,mask); std::swap(NNF,NNF2);
checkCudaError(cudaDeviceSynchronize());
krnlRandomSearchPass<<<numBlocks,threadsPerBlock>>>(sizeA,sizeB,Omega,patchWidth,patchError,lambda,NNF,E,mask,rngStates);
checkCudaError(cudaDeviceSynchronize());
}
// final pass refreshes E exactly (propagation may leave early-out
// partial sums in E for rejected candidates' pixels)
krnlEvalErrorPass<<<numBlocks,threadsPerBlock>>>(patchWidth,patchError,NNF,E);
checkCudaError(cudaDeviceSynchronize());
}

6755
src/stb_image.h Normal file

File diff suppressed because it is too large Load Diff

1048
src/stb_image_write.h Normal file

File diff suppressed because it is too large Load Diff

300
src/texarray2.h Normal file
View File

@@ -0,0 +1,300 @@
// This software is in the public domain. Where that dedication is not
// recognized, you are granted a perpetual, irrevocable license to copy
// and modify this file as you see fit.
#ifndef TEXARRAY2_H_
#define TEXARRAY2_H_
#include "jzq.h"
#include "cudacheck.h"
#include <cuda_runtime.h>
// Maps (channel count N, scalar type T) to the built-in CUDA vector type
// usable with tex2D.  Only N = 1, 2, 4 are specialized: textures do not
// support 3-channel formats, so TexArray2 below splits wide pixels into
// 1/2/4-channel layers instead.
template<int N, typename T>
struct CudaVec { };
template<> struct CudaVec<1, unsigned char> { typedef uchar1 type; };
template<> struct CudaVec<2, unsigned char> { typedef uchar2 type; };
template<> struct CudaVec<4, unsigned char> { typedef uchar4 type; };
template<> struct CudaVec<1, int> { typedef int1 type; };
template<> struct CudaVec<2, int> { typedef int2 type; };
template<> struct CudaVec<4, int> { typedef int4 type; };
template<> struct CudaVec<1, float> { typedef float1 type; };
template<> struct CudaVec<2, float> { typedef float2 type; };
template<> struct CudaVec<4, float> { typedef float4 type; };
// Maps a scalar type to its cudaChannelFormatKind for channel descriptors.
template<typename T>
struct CudaKind { };
template<> struct CudaKind<unsigned char> { static const cudaChannelFormatKind kind = cudaChannelFormatKindUnsigned; };
template<> struct CudaKind<int> { static const cudaChannelFormatKind kind = cudaChannelFormatKindSigned; };
template<> struct CudaKind<float> { static const cudaChannelFormatKind kind = cudaChannelFormatKindFloat; };
// Converts a built-in CUDA vector value to the jzq Vec of the same shape.
__device__ Vec<1, unsigned char> cuda2jzq(const uchar1& vec) { return Vec<1, unsigned char>(vec.x); }
__device__ Vec<2, unsigned char> cuda2jzq(const uchar2& vec) { return Vec<2, unsigned char>(vec.x, vec.y); }
__device__ Vec<4, unsigned char> cuda2jzq(const uchar4& vec) { return Vec<4, unsigned char>(vec.x, vec.y, vec.z, vec.w); }
__device__ Vec<1, int> cuda2jzq(const int1& vec) { return Vec<1, int>(vec.x); }
__device__ Vec<2, int> cuda2jzq(const int2& vec) { return Vec<2, int>(vec.x, vec.y); }
__device__ Vec<4, int> cuda2jzq(const int4& vec) { return Vec<4, int>(vec.x, vec.y, vec.z, vec.w); }
__device__ Vec<1, float> cuda2jzq(const float1& vec) { return Vec<1, float>(vec.x); }
__device__ Vec<2, float> cuda2jzq(const float2& vec) { return Vec<2, float>(vec.x, vec.y); }
__device__ Vec<4, float> cuda2jzq(const float4& vec) { return Vec<4, float>(vec.x, vec.y, vec.z, vec.w); }
// Number of M-channel texture layers needed to hold N channels
// (ceiling of N/M).  Fully parenthesized so the macro expands safely
// inside larger expressions (the original `1+(N-1)/M` mis-parses when
// multiplied or when arguments contain operators).
#define N_LAYERS(N,M) (1+((N)-1)/(M))
// One pitched 2-D device buffer of N-channel pixels (N must be 1, 2 or
// 4) wrapped in a cudaTextureObject for cached, clamped, unfiltered
// reads.  Writes go straight to the underlying pitched memory, so they
// are not guaranteed visible through texObj within the same kernel
// launch.  No destructor: call destroy() on the owning copy.
template<int N, typename T>
struct TexLayer2
{
size_t pitch;   // row pitch in bytes of the pitched allocation
void* data;     // device pointer to the pitched buffer
cudaTextureObject_t texObj;
// default ctor leaves members uninitialized; only valid as an
// assignment target (see TexArray2's constructors)
TexLayer2(){};
TexLayer2(int width, int height)
{
checkCudaError(cudaMallocPitch(&data, &pitch, width*N*sizeof(T), height));
const int bits = 8 * sizeof(T);
// per-channel bit counts indexed by N-1; the N==3 row is a deliberate
// poison value (-1) since 3-channel textures are unsupported
const int bitsTable[4][4] = { { bits, 0, 0, 0 },
{ bits, bits, 0, 0 },
{ -1, -1, -1, -1 },
{ bits, bits, bits, bits } };
cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = data;
resDesc.res.pitch2D.pitchInBytes = pitch;
resDesc.res.pitch2D.width = width;
resDesc.res.pitch2D.height = height;
resDesc.res.pitch2D.desc = cudaCreateChannelDesc(bitsTable[N - 1][0],
bitsTable[N - 1][1],
bitsTable[N - 1][2],
bitsTable[N - 1][3],
CudaKind<T>::kind);
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeClamp;  // out-of-range reads clamp to edge
texDesc.addressMode[1] = cudaAddressModeClamp;
texDesc.filterMode = cudaFilterModePoint;       // nearest-neighbor, no interpolation
texDesc.readMode = cudaReadModeElementType;     // return raw T, not normalized float
texDesc.normalizedCoords = 0;                   // integer pixel coordinates
texObj = 0;
checkCudaError(cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL));
}
// cached read of pixel (x,y) via the texture object (clamped at edges)
Vec<N, T> __device__ operator()(int x, int y) const
{
return cuda2jzq(tex2D<CudaVec<N, T>::type>(texObj, x, y));
}
// direct (uncached) write of pixel (x,y) into the pitched buffer;
// no bounds checking — the caller must keep (x,y) in range
void __device__ write(int x, int y, const Vec<N, T>& value)
{
Vec<N, T>* ptr = (Vec<N, T>*)&((unsigned char*)data)[x*sizeof(Vec<N, T>) + y*pitch];
*ptr = value;
}
void destroy()
{
checkCudaError( cudaDestroyTextureObject(texObj) );
checkCudaError( cudaFree(data) );
}
};
// 2-D device array of N-channel pixels stored as N_LAYERS(N,M) texture
// layers of M channels each (M defaults to N for N<3, else 4, since CUDA
// textures only support 1/2/4 channels).  tmp_data is a packed pitched
// staging buffer used by the copy() helpers below to convert between the
// host layout (one Vec<N,T> per pixel) and the layered layout.
// Shallow-copyable (passed to kernels by value); call destroy() on the
// owning copy.
template<int N, typename T, int M = N<3 ? N : 4>
struct TexArray2
{
int width;
int height;
TexLayer2<M, T> texLayers[N_LAYERS(N, M)];
size_t tmp_pitch;  // row pitch in bytes of the staging buffer
void* tmp_data;    // packed staging buffer (Vec<N,T> per pixel)
TexArray2() : width(0),height(0),tmp_pitch(0),tmp_data(0) { }
TexArray2(const V2i& size)
{
width = size(0);
height = size(1);
checkCudaError(cudaMallocPitch(&tmp_data, &tmp_pitch, width*N*sizeof(T), height));
for (int i = 0; i < N_LAYERS(N, M); ++i)
texLayers[i] = TexLayer2<M, T>(width, height);
}
TexArray2(int width, int height)
{
this->width = width;
this->height = height;
checkCudaError(cudaMallocPitch(&tmp_data, &tmp_pitch, width*N*sizeof(T), height));
for (int i = 0; i < N_LAYERS(N, M); ++i)
texLayers[i] = TexLayer2<M, T>(width, height);
}
// Reads pixel (x,y): gathers M channels from each full layer, then the
// remaining N%M channels from the final partial layer (if any).
Vec<N, T> __device__ operator()(int x, int y) const
{
Vec<N, T> ret;
Vec<M, T> tmp;
for (int i = 0; i < N / M; ++i){
tmp = texLayers[i](x, y);
for (int j = 0; j < M; ++j)
ret[i*M + j] = tmp[j];
}
if (N % M != 0){
tmp = texLayers[N / M](x, y);
for (int j = 0; j < N % M; ++j)
ret[(N / M)*M + j] = tmp[j];
}
return ret;
}
// Writes pixel (x,y), scattering the N channels across the layers in
// the same order operator() gathers them (trailing channels of a
// partial last layer are left unspecified).
void __device__ write(int x, int y, const Vec<N, T>& value)
{
Vec<M, T> tmp;
for (int i = 0; i < N / M; ++i){
for (int j = 0; j < M; ++j)
tmp[j] = value[i*M + j];
texLayers[i].write(x, y, tmp);
}
if (N % M != 0){
for (int j = 0; j < N % M; ++j)
tmp[j] = value[(N / M)*M + j];
texLayers[N / M].write(x, y, tmp);
}
}
V2i size() const
{
return V2i(width,height);
}
// Releases every layer and the staging buffer.
void destroy()
{
for (int i = 0; i < N_LAYERS(N, M); ++i)
{
texLayers[i].destroy();
}
checkCudaError( cudaFree(tmp_data) );
}
};
// Scatters the packed staging buffer (A.tmp_data, one Vec<N,T> per
// pixel row, pitched by A.tmp_pitch) into A's per-layer texture storage.
template<int N, typename T, int M>
__global__ void tmpToLayers(TexArray2<N, T, M> A)
{
  const int x = blockIdx.x*blockDim.x + threadIdx.x;
  const int y = blockIdx.y*blockDim.y + threadIdx.y;
  if (x >= A.width || y >= A.height) { return; }
  const unsigned char* rowBase = (unsigned char*)A.tmp_data + y*A.tmp_pitch;
  const Vec<N, T>* srcPixel = (const Vec<N, T>*)(rowBase + x*sizeof(Vec<N, T>));
  A.write(x, y, *srcPixel);
}
// Gathers pixels from A's per-layer texture storage into the packed
// staging buffer (A.tmp_data, one Vec<N,T> per pixel, pitched rows).
template<int N, typename T, int M>
__global__ void layersToTmp(const TexArray2<N, T, M> A)
{
  const int x = blockIdx.x*blockDim.x + threadIdx.x;
  const int y = blockIdx.y*blockDim.y + threadIdx.y;
  if (x >= A.width || y >= A.height) { return; }
  unsigned char* rowBase = (unsigned char*)A.tmp_data + y*A.tmp_pitch;
  Vec<N, T>* dstPixel = (Vec<N, T>*)(rowBase + x*sizeof(Vec<N, T>));
  *dstPixel = A(x, y);
}
// Uploads a host Array2 of N-channel pixels into a TexArray2: copies the
// packed pixels into the staging buffer, then launches tmpToLayers to
// distribute them into the per-layer texture storage.
template<int N, typename T, int M>
void copy(TexArray2<N, T, M>* out_dst,const Array2<Vec<N, T>>& src)
{
  assert(out_dst != 0);
  const TexArray2<N, T, M>& dst = *out_dst;
  assert(dst.width == src.width());
  assert(dst.height == src.height());
  const int srcWidthInBytes = src.width()*sizeof(Vec<N, T>);
  const int srcPitchInBytes = srcWidthInBytes; // host rows are tightly packed
  checkCudaError(cudaMemcpy2D(dst.tmp_data, dst.tmp_pitch, src.data(), srcPitchInBytes, srcWidthInBytes, src.height(), cudaMemcpyHostToDevice));
  const int numThreadsPerBlock = 16;
  const dim3 threadsPerBlock = dim3(numThreadsPerBlock, numThreadsPerBlock);
  const dim3 numBlocks = dim3((src.width() + threadsPerBlock.x) / threadsPerBlock.x,
                              (src.height() + threadsPerBlock.y) / threadsPerBlock.y);
  tmpToLayers << <numBlocks, threadsPerBlock >> >(dst);
  // kernel launches don't return errors directly; surface them here
  checkCudaError(cudaGetLastError());
}
// Uploads a raw host buffer (tightly packed Vec<N,T> pixels, row-major,
// dst-sized) into a TexArray2 via the staging buffer and tmpToLayers.
template<int N, typename T, int M>
void copy(TexArray2<N, T, M>* out_dst,void* src_data)
{
  assert(out_dst != 0);
  assert(src_data != 0);
  const TexArray2<N, T, M>& dst = *out_dst;
  const int srcWidthInBytes = dst.width*sizeof(Vec<N,T>);
  const int srcPitchInBytes = srcWidthInBytes; // source rows assumed tightly packed
  checkCudaError(cudaMemcpy2D(dst.tmp_data, dst.tmp_pitch, src_data, srcPitchInBytes, srcWidthInBytes, dst.height, cudaMemcpyHostToDevice));
  const int numThreadsPerBlock = 16;
  const dim3 threadsPerBlock = dim3(numThreadsPerBlock, numThreadsPerBlock);
  const dim3 numBlocks = dim3((dst.width + threadsPerBlock.x) / threadsPerBlock.x,
                              (dst.height + threadsPerBlock.y) / threadsPerBlock.y);
  tmpToLayers << <numBlocks, threadsPerBlock >> >(dst);
  // kernel launches don't return errors directly; surface them here
  checkCudaError(cudaGetLastError());
}
// Downloads a TexArray2 into a host Array2: launches layersToTmp to pack
// the layered pixels into the staging buffer, then copies it to the host
// (the blocking cudaMemcpy2D in the default stream orders after the
// kernel, so no explicit sync is needed).
template<int N, typename T, int M>
void copy(Array2<Vec<N, T>>* out_dst, const TexArray2<N, T, M>& src)
{
  assert(out_dst != 0);
  const Array2<Vec<N, T>>& dst = *out_dst;
  assert(dst.width() == src.width);
  assert(dst.height() == src.height);
  const int numThreadsPerBlock = 16;
  const dim3 threadsPerBlock = dim3(numThreadsPerBlock, numThreadsPerBlock);
  const dim3 numBlocks = dim3((dst.width() + threadsPerBlock.x) / threadsPerBlock.x,
                              (dst.height() + threadsPerBlock.y) / threadsPerBlock.y);
  layersToTmp << <numBlocks, threadsPerBlock >> >(src);
  // kernel launches don't return errors directly; surface them here
  checkCudaError(cudaGetLastError());
  const int dstPitchInBytes = dst.width()*sizeof(Vec<N, T>);
  checkCudaError(cudaMemcpy2D((void*)dst.data(), dstPitchInBytes, src.tmp_data, src.tmp_pitch, src.width*N*sizeof(T), src.height, cudaMemcpyDeviceToHost));
}
// Downloads a TexArray2 into a raw host buffer (*out_dst_data must point
// to at least width*height*sizeof(Vec<N,T>) bytes, rows tightly packed):
// packs the layers via layersToTmp, then copies the staging buffer out.
template<int N, typename T, int M>
void copy(void** out_dst_data, const TexArray2<N, T, M>& src)
{
  assert(out_dst_data != 0);
  assert(*out_dst_data != 0);
  const int numThreadsPerBlock = 16;
  const dim3 threadsPerBlock = dim3(numThreadsPerBlock, numThreadsPerBlock);
  const dim3 numBlocks = dim3((src.width + threadsPerBlock.x) / threadsPerBlock.x,
                              (src.height + threadsPerBlock.y) / threadsPerBlock.y);
  layersToTmp << <numBlocks, threadsPerBlock >> >(src);
  // kernel launches don't return errors directly; surface them here
  checkCudaError(cudaGetLastError());
  const int dstPitchInBytes = src.width*sizeof(Vec<N, T>);
  checkCudaError(cudaMemcpy2D((void*)*out_dst_data, dstPitchInBytes, src.tmp_data, src.tmp_pitch, src.width*N*sizeof(T), src.height, cudaMemcpyDeviceToHost));
}
#endif