initial commit

This commit is contained in:
Ondrej Jamriska
2018-04-16 04:23:07 +02:00
parent 31f7827d7d
commit 784ca88cc3
45 changed files with 12034 additions and 2 deletions

20
src/cudacheck.h Normal file
View File

@@ -0,0 +1,20 @@
#ifndef CUDACHECK_H_
#define CUDACHECK_H_
#include <cstdio>

// Reports a failed CUDA call (any non-zero result code).
//
// Parameters:
//   result - status value returned by the CUDA call (non-zero == error)
//   func   - stringified source expression of the call (from the macro)
//   file   - source file of the call site
//   line   - source line of the call site
// Returns true when an error occurred, false otherwise, so callers can
// react if they wish; the error is printed either way.
// Use via the checkCudaError() macro below, which fills in func/file/line.
template<typename T>
bool checkCudaError_(T result,char const* const func,const char* const file,int const line)
{
  if (result)
  {
    // %u matches the unsigned cast of the error code (the original %d
    // relied on signed/unsigned mismatch working out).
    printf("CUDA error at %s:%d code=%u \"%s\"\n",file,line,static_cast<unsigned int>(result),func);
    return true;
  }
  return false;
}
#define checkCudaError(val) checkCudaError_((val),#val,__FILE__,__LINE__)
#endif

1270
src/ebsynth.cu Normal file

File diff suppressed because it is too large Load Diff

1901
src/jzq.h Normal file

File diff suppressed because it is too large Load Diff

73
src/memarray2.h Normal file
View File

@@ -0,0 +1,73 @@
// This software is in the public domain. Where that dedication is not
// recognized, you are granted a perpetual, irrevocable license to copy
// and modify this file as you see fit.
#ifndef MEMARRAY2_H_
#define MEMARRAY2_H_
#include "jzq.h"
//#include "cudacheck.h"
// Owning wrapper around a linear, row-major 2-D buffer in GPU global
// memory; element (i,j) lives at data[i+j*width].
//
// The struct is intentionally shallow-copyable (it is passed to kernels
// by value), so it has no destructor; call destroy() exactly once on the
// owning copy.
template<typename T>
struct MemArray2
{
  T* data;
  int width;
  int height;

  // Members listed in declaration order (data, width, height) so the
  // mem-initializer list matches actual initialization order (-Wreorder).
  MemArray2() : data(0),width(0),height(0) {};

  // Allocates an uninitialized width x height device buffer.
  MemArray2(const V2i& size)
  {
    width = size(0);
    height = size(1);
    checkCudaError(cudaMalloc(&data,width*height*sizeof(T)));
  }

  MemArray2(int _width,int _height)
  {
    width = _width;
    height = _height;
    checkCudaError(cudaMalloc(&data,width*height*sizeof(T)));
  }

  // Frees the device buffer.  Safe on a default-constructed instance
  // (cudaFree(0) is a no-op); data is nulled to guard against double-free.
  void destroy()
  {
    checkCudaError( cudaFree(data) );
    data = 0;
  }
};
// Uploads a host-side Array2 into an equally-sized device MemArray2.
template<typename T>
void copy(MemArray2<T>* out_dst,const Array2<T>& src)
{
  assert(out_dst != 0);
  MemArray2<T>& dst = *out_dst;
  assert(src.width() == dst.width);
  assert(src.height() == dst.height);
  const size_t numBytes = src.width()*src.height()*sizeof(T);
  checkCudaError(cudaMemcpy(dst.data,src.data(),numBytes,cudaMemcpyHostToDevice));
}
// Downloads a device MemArray2 into an equally-sized host Array2.
template<typename T>
void copy(Array2<T>* out_dst,const MemArray2<T>& src)
{
  assert(out_dst != 0);
  // Non-const reference: the destination is written, matching the
  // host->device overload above (the original took a const reference
  // and then cast away constness).
  Array2<T>& dst = *out_dst;
  assert(dst.width() == src.width);
  assert(dst.height() == src.height);
  checkCudaError(cudaMemcpy((void*)dst.data(),src.data, src.width*src.height*sizeof(T), cudaMemcpyDeviceToHost));
}
#endif

342
src/patchmatch_gpu.h Normal file
View File

@@ -0,0 +1,342 @@
// This software is in the public domain. Where that dedication is not
// recognized, you are granted a perpetual, irrevocable license to copy
// and modify this file as you see fit.
#ifndef PATCHMATCH_GPU_H_
#define PATCHMATCH_GPU_H_
#include <cfloat>
#include <curand.h>
#include <curand_kernel.h>
#include "texarray2.h"
#include "memarray2.h"
typedef Vec<1,float> V1f;
typedef Array2<Vec<1,float>> A2V1f;
// Seeds one curand state per pixel of a width x height grid; each thread
// derives a distinct seed from its flat pixel index.
__global__ void krnlInitRngStates(const int width,
                                  const int height,
                                  curandState* rngStates)
{
  const int x = blockIdx.x*blockDim.x + threadIdx.x;
  const int y = blockIdx.y*blockDim.y + threadIdx.y;
  if (x>=width || y>=height) { return; }
  const int pixelIndex = y*width + x;
  curand_init((1337 << 20) + pixelIndex, 0, 0, &rngStates[pixelIndex]);
}
// Allocates and seeds a width x height array of curand states on the
// device.  The caller owns the returned buffer (release with cudaFree).
curandState* initGpuRng(const int width,
                        const int height)
{
  curandState* gpuRngStates;
  // Check the allocation: an unreported failure here would surface much
  // later as an illegal-address error inside a kernel.
  checkCudaError(cudaMalloc(&gpuRngStates,width*height*sizeof(curandState)));
  const dim3 threadsPerBlock(16,16);
  const dim3 numBlocks((width+threadsPerBlock.x)/threadsPerBlock.x,
                       (height+threadsPerBlock.y)/threadsPerBlock.y);
  krnlInitRngStates<<<numBlocks,threadsPerBlock>>>(width,height,gpuRngStates);
  // Kernel launches do not return errors directly; pick up launch-config
  // failures here.
  checkCudaError(cudaGetLastError());
  return gpuRngStates;
}
// Patch-dissimilarity functor: per-channel weighted sum of squared
// differences between the patchWidth x patchWidth patch centered at
// (ax,ay) in A and the one centered at (bx,by) in B.
template<int N,typename T,int M>
struct PatchSSD
{
const TexArray2<N,T,M> A;
const TexArray2<N,T,M> B;
const Vec<N,float> weights;  // per-channel weight applied to each squared difference
PatchSSD(const TexArray2<N,T,M>& A,
const TexArray2<N,T,M>& B,
const Vec<N,float>& weights)
: A(A),B(B),weights(weights) {}
// Returns the weighted SSD.  ebest is an early-out budget: once the
// running sum exceeds it (checked after each patch row) a partial sum
// larger than ebest is returned, so the result is only exact when it
// is <= ebest.
__device__ float operator()(int patchWidth,
const int ax,
const int ay,
const int bx,
const int by,
const float ebest)
{
const int hpw = patchWidth/2;
float ssd = 0;
for(int py=-hpw;py<=+hpw;py++)
{
for(int px=-hpw;px<=+hpw;px++)
{
const Vec<N,T> pixelA = A(ax + px, ay + py);
const Vec<N,T> pixelB = B(bx + px, by + py);
for(int i=0;i<N;i++)
{
const float diff = float(pixelA[i])-float(pixelB[i]);
ssd += weights[i]*diff*diff;
}
}
// early out: a candidate already worse than ebest can never win
if (ssd>ebest) { return ssd; }
}
return ssd;
}
};
// Recomputes the per-pixel patch error E for the current NNF.
// One thread per NNF pixel; FLT_MAX disables patchError's early out so
// the stored error is exact.
template<typename FUNC>
__global__ void krnlEvalErrorPass(const int patchWidth,
                                  FUNC patchError,
                                  const TexArray2<2,int> NNF,
                                  TexArray2<1,float> E)
{
  const int x = blockIdx.x*blockDim.x + threadIdx.x;
  const int y = blockIdx.y*blockDim.y + threadIdx.y;
  if (x>=NNF.width || y>=NNF.height) { return; }
  const V2i match = NNF(x,y);
  const float err = patchError(patchWidth,x,y,match[0],match[1],FLT_MAX);
  E.write(x,y,V1f(err));
}
// Adds incdec to the occupancy counter of every pixel covered by the
// patch centered at (bx,by).  Atomic: many threads may retarget
// overlapping patches concurrently.
void __device__ updateOmega(MemArray2<int>& Omega,const int patchWidth,const int bx,const int by,const int incdec)
{
  const int halfWidth = patchWidth/2;
  for(int oy=-halfWidth;oy<=+halfWidth;oy++)
  {
    for(int ox=-halfWidth;ox<=+halfWidth;ox++)
    {
      const int idx = (bx+ox)+(by+oy)*Omega.width;
      atomicAdd(&Omega.data[idx],incdec);
    }
  }
}
// Sums the occupancy counters over the patch centered at (bx,by).
// Uses plain (non-atomic) loads, so the result may be slightly stale
// while other threads are updating Omega concurrently.
int __device__ patchOmega(const int patchWidth,const int bx,const int by,const MemArray2<int>& Omega)
{
  const int halfWidth = patchWidth/2;
  int total = 0;
  for(int oy=-halfWidth;oy<=+halfWidth;oy++)
  {
    for(int ox=-halfWidth;ox<=+halfWidth;ox++)
    {
      total += Omega.data[(bx+ox)+(by+oy)*Omega.width];
    }
  }
  return total;
}
// Evaluates candidate match (bx,by) for source pixel (ax,ay) against the
// current best (nbest,ebest) using an occupancy-penalized error
// (err + lambda*occ).  On acceptance it updates the occupancy map Omega
// (increment at the new target, decrement at the old one) and overwrites
// nbest/ebest in place.
template<typename FUNC>
__device__ void tryPatch(const V2i& sizeA,
const V2i& sizeB,
MemArray2<int>& Omega,
const int patchWidth,
FUNC patchError,
const float lambda,
const int ax,
const int ay,
const int bx,
const int by,
V2i& nbest,
float& ebest)
{
// expected per-patch occupancy if source patches were spread uniformly
// over B (used to normalize the occupancy terms below)
const float omegaBest = (float(sizeA(0)*sizeA(1)) /
float(sizeB(0)*sizeB(1))) * float(patchWidth*patchWidth);
const float curOcc = (float(patchOmega(patchWidth,nbest(0),nbest(1),Omega))/float(patchWidth*patchWidth))/omegaBest;
const float newOcc = (float(patchOmega(patchWidth, bx, by,Omega))/float(patchWidth*patchWidth))/omegaBest;
const float curErr = ebest;
// curErr+lambda*curOcc doubles as patchError's early-out budget: the
// candidate cannot win once its raw error alone exceeds that total
const float newErr = patchError(patchWidth,ax,ay,bx,by,curErr+lambda*curOcc);
if ((newErr+lambda*newOcc) < (curErr+lambda*curOcc))
{
updateOmega(Omega,patchWidth, bx, by,+1);
updateOmega(Omega,patchWidth,nbest(0),nbest(1),-1);
nbest = V2i(bx,by);
ebest = newErr;
}
}
// Propagation step: borrows the NNF entry of the neighbor at offset
// (ox,oy), shifts it back by the same offset, and proposes the result as
// a candidate match for pixel (x,y).  Candidates whose patch would stick
// out of B are rejected.
template<typename FUNC>
__device__ void tryNeighborsOffset(const int x,
                                   const int y,
                                   const int ox,
                                   const int oy,
                                   V2i& nbest,
                                   float& ebest,
                                   const V2i& sizeA,
                                   const V2i& sizeB,
                                   MemArray2<int>& Omega,
                                   const int patchWidth,
                                   FUNC patchError,
                                   const float lambda,
                                   const TexArray2<2,int>& NNF)
{
  const int hpw = patchWidth/2;
  const V2i neighborMatch = NNF(x+ox,y+oy);
  const int candidateX = neighborMatch(0)-ox;
  const int candidateY = neighborMatch(1)-oy;
  const bool patchInsideB = candidateX>=hpw && candidateX<sizeB(0)-hpw &&
                            candidateY>=hpw && candidateY<sizeB(1)-hpw;
  if (patchInsideB)
  {
    tryPatch(sizeA,sizeB,Omega,patchWidth,patchError,lambda,x,y,candidateX,candidateY,nbest,ebest);
  }
}
// One propagation pass of PatchMatch: each pixel of A tries to adopt the
// (offset-corrected) matches of its four axis-aligned neighbors at
// distance r.  Pixels where mask != 255 are frozen.  Results are written
// to the double buffer NNF2 (and E updated in place); the host swaps
// NNF/NNF2 after the launch.
template<typename FUNC>
__global__ void krnlPropagationPass(const V2i sizeA,
const V2i sizeB,
MemArray2<int> Omega,
const int patchWidth,
FUNC patchError,
const float lambda,
const int r,
const TexArray2<2,int> NNF,
TexArray2<2,int> NNF2,
TexArray2<1,float> E,
TexArray2<1,unsigned char> mask)
{
const int x = blockDim.x*blockIdx.x + threadIdx.x;
const int y = blockDim.y*blockIdx.y + threadIdx.y;
if (x<sizeA(0) && y<sizeA(1))
{
V2i nbest = NNF(x,y);
float ebest = E(x,y)(0);
if (mask(x,y)[0]==255)
{
tryNeighborsOffset(x,y,-r,0,nbest,ebest,sizeA,sizeB,Omega,patchWidth,patchError,lambda,NNF);
tryNeighborsOffset(x,y,+r,0,nbest,ebest,sizeA,sizeB,Omega,patchWidth,patchError,lambda,NNF);
tryNeighborsOffset(x,y,0,-r,nbest,ebest,sizeA,sizeB,Omega,patchWidth,patchError,lambda,NNF);
tryNeighborsOffset(x,y,0,+r,nbest,ebest,sizeA,sizeB,Omega,patchWidth,patchError,lambda,NNF);
}
// written unconditionally so masked pixels are carried over into NNF2
E.write(x,y,V1f(ebest));
NNF2.write(x,y,nbest);
}
}
// Random-search step: samples one candidate uniformly from the window of
// radius r around norg (clamped so the whole patch stays inside B) and
// proposes it as a match for pixel (x,y).
template<typename FUNC>
__device__ void tryRandomOffsetInRadius(const int r,
                                        const V2i& sizeA,
                                        const V2i& sizeB,
                                        MemArray2<int>& Omega,
                                        const int patchWidth,
                                        FUNC patchError,
                                        const float lambda,
                                        const int x,
                                        const int y,
                                        const V2i& norg,
                                        V2i& nbest,
                                        float& ebest,
                                        curandState* rngState)
{
  const int hpw = patchWidth/2;
  const int xmin = max(norg(0)-r,hpw);
  const int xmax = min(norg(0)+r,sizeB(0)-1-hpw);
  const int ymin = max(norg(1)-r,hpw);
  const int ymax = min(norg(1)+r,sizeB(1)-1-hpw);
  // guard: when B is smaller than the patch the clamped window is empty
  // and the modulo below would divide by a non-positive value (UB)
  if (xmax<xmin || ymax<ymin) { return; }
  const int nx = xmin+(curand(rngState)%(xmax-xmin+1));
  const int ny = ymin+(curand(rngState)%(ymax-ymin+1));
  tryPatch(sizeA,sizeB,Omega,patchWidth,patchError,lambda,x,y,nx,ny,nbest,ebest);
}
// One random-search pass of PatchMatch: for each unmasked pixel, samples
// candidates at exponentially growing radii (1,2,4,...) around the
// pixel's current match and keeps improvements.  Updates NNF and E in
// place (no double buffer); pixels where mask != 255 are left untouched.
template<typename FUNC>
__global__ void krnlRandomSearchPass(const V2i sizeA,
const V2i sizeB,
MemArray2<int> Omega,
const int patchWidth,
FUNC patchError,
const float lambda,
TexArray2<2,int> NNF,
TexArray2<1,float> E,
TexArray2<1,unsigned char> mask,
curandState* rngStates)
{
const int x = blockDim.x*blockIdx.x + threadIdx.x;
const int y = blockDim.y*blockIdx.y + threadIdx.y;
if (x<sizeA(0) && y<sizeA(1))
{
if (mask(x,y)[0]==255)
{
V2i nbest = NNF(x,y);
float ebest = E(x,y)(0);
// search windows are centered on the match held at pass entry,
// not on the improving nbest
const V2i norg = nbest;
for(int r=1;r<max(sizeB(0),sizeB(1))/2;r=r*2)
{
// rngStates was allocated with one state per NNF pixel (see initGpuRng)
tryRandomOffsetInRadius(r,sizeA,sizeB,Omega,patchWidth,patchError,lambda,x,y,norg,nbest,ebest,&rngStates[x+y*NNF.width]);
}
E.write(x,y,V1f(ebest));
NNF.write(x,y,nbest);
}
}
}
// Runs uniformity-constrained PatchMatch on the GPU: an initial error
// evaluation, then numIters iterations of three propagation passes
// (neighbor distances 4, 2, 1 — jump-flood style) plus one random-search
// pass, then a final exact error evaluation.
//
// NNF/NNF2 are a double buffer: each propagation pass writes into NNF2
// and the host swaps the wrappers afterwards, so on return NNF holds the
// final field.  Omega is the shared occupancy map, mask gates which
// pixels may update, rngStates must come from initGpuRng sized like NNF.
// The cudaDeviceSynchronize after every launch both orders the swap and
// surfaces kernel execution errors through checkCudaError.
template<typename FUNC>
void patchmatchGPU(const V2i sizeA,
const V2i sizeB,
MemArray2<int>& Omega,
const int patchWidth,
FUNC patchError,
const float lambda,
const int numIters,
const int numThreadsPerBlock,
TexArray2<2,int>& NNF,
TexArray2<2,int>& NNF2,
TexArray2<1,float>& E,
TexArray2<1,unsigned char>& mask,
curandState* rngStates)
{
const dim3 threadsPerBlock = dim3(numThreadsPerBlock,numThreadsPerBlock);
const dim3 numBlocks = dim3((NNF.width+threadsPerBlock.x)/threadsPerBlock.x,
(NNF.height+threadsPerBlock.y)/threadsPerBlock.y);
krnlEvalErrorPass<<<numBlocks,threadsPerBlock>>>(patchWidth,patchError,NNF,E);
checkCudaError(cudaDeviceSynchronize());
for(int i=0;i<numIters;i++)
{
krnlPropagationPass<<<numBlocks,threadsPerBlock>>>(sizeA,sizeB,Omega,patchWidth,patchError,lambda,4,NNF,NNF2,E,mask); std::swap(NNF,NNF2);
checkCudaError(cudaDeviceSynchronize());
krnlPropagationPass<<<numBlocks,threadsPerBlock>>>(sizeA,sizeB,Omega,patchWidth,patchError,lambda,2,NNF,NNF2,E,mask); std::swap(NNF,NNF2);
checkCudaError(cudaDeviceSynchronize());
krnlPropagationPass<<<numBlocks,threadsPerBlock>>>(sizeA,sizeB,Omega,patchWidth,patchError,lambda,1,NNF,NNF2,E,mask); std::swap(NNF,NNF2);
checkCudaError(cudaDeviceSynchronize());
krnlRandomSearchPass<<<numBlocks,threadsPerBlock>>>(sizeA,sizeB,Omega,patchWidth,patchError,lambda,NNF,E,mask,rngStates);
checkCudaError(cudaDeviceSynchronize());
}
// final pass refreshes E exactly (propagation may leave early-out
// partial sums in E for rejected candidates' pixels)
krnlEvalErrorPass<<<numBlocks,threadsPerBlock>>>(patchWidth,patchError,NNF,E);
checkCudaError(cudaDeviceSynchronize());
}

6755
src/stb_image.h Normal file

File diff suppressed because it is too large Load Diff

1048
src/stb_image_write.h Normal file

File diff suppressed because it is too large Load Diff

300
src/texarray2.h Normal file
View File

@@ -0,0 +1,300 @@
// This software is in the public domain. Where that dedication is not
// recognized, you are granted a perpetual, irrevocable license to copy
// and modify this file as you see fit.
#ifndef TEXARRAY2_H_
#define TEXARRAY2_H_
#include "jzq.h"
#include "cudacheck.h"
#include <cuda_runtime.h>
// Maps (channel count N, scalar type T) to the built-in CUDA vector type
// usable with tex2D.  Only N = 1, 2, 4 are specialized: textures do not
// support 3-channel formats, so TexArray2 below splits wide pixels into
// 1/2/4-channel layers instead.
template<int N, typename T>
struct CudaVec { };
template<> struct CudaVec<1, unsigned char> { typedef uchar1 type; };
template<> struct CudaVec<2, unsigned char> { typedef uchar2 type; };
template<> struct CudaVec<4, unsigned char> { typedef uchar4 type; };
template<> struct CudaVec<1, int> { typedef int1 type; };
template<> struct CudaVec<2, int> { typedef int2 type; };
template<> struct CudaVec<4, int> { typedef int4 type; };
template<> struct CudaVec<1, float> { typedef float1 type; };
template<> struct CudaVec<2, float> { typedef float2 type; };
template<> struct CudaVec<4, float> { typedef float4 type; };
// Maps a scalar type to its cudaChannelFormatKind for channel descriptors.
template<typename T>
struct CudaKind { };
template<> struct CudaKind<unsigned char> { static const cudaChannelFormatKind kind = cudaChannelFormatKindUnsigned; };
template<> struct CudaKind<int> { static const cudaChannelFormatKind kind = cudaChannelFormatKindSigned; };
template<> struct CudaKind<float> { static const cudaChannelFormatKind kind = cudaChannelFormatKindFloat; };
// Converts a built-in CUDA vector value to the jzq Vec of the same shape.
__device__ Vec<1, unsigned char> cuda2jzq(const uchar1& vec) { return Vec<1, unsigned char>(vec.x); }
__device__ Vec<2, unsigned char> cuda2jzq(const uchar2& vec) { return Vec<2, unsigned char>(vec.x, vec.y); }
__device__ Vec<4, unsigned char> cuda2jzq(const uchar4& vec) { return Vec<4, unsigned char>(vec.x, vec.y, vec.z, vec.w); }
__device__ Vec<1, int> cuda2jzq(const int1& vec) { return Vec<1, int>(vec.x); }
__device__ Vec<2, int> cuda2jzq(const int2& vec) { return Vec<2, int>(vec.x, vec.y); }
__device__ Vec<4, int> cuda2jzq(const int4& vec) { return Vec<4, int>(vec.x, vec.y, vec.z, vec.w); }
__device__ Vec<1, float> cuda2jzq(const float1& vec) { return Vec<1, float>(vec.x); }
__device__ Vec<2, float> cuda2jzq(const float2& vec) { return Vec<2, float>(vec.x, vec.y); }
__device__ Vec<4, float> cuda2jzq(const float4& vec) { return Vec<4, float>(vec.x, vec.y, vec.z, vec.w); }
// Number of M-channel texture layers needed to hold N channels
// (ceiling of N/M).  Fully parenthesized so the macro expands safely
// inside larger expressions (the original `1+(N-1)/M` mis-parses when
// multiplied or when arguments contain operators).
#define N_LAYERS(N,M) (1+((N)-1)/(M))
// One pitched 2-D device buffer of N-channel pixels (N must be 1, 2 or
// 4) wrapped in a cudaTextureObject for cached, clamped, unfiltered
// reads.  Writes go straight to the underlying pitched memory, so they
// are not guaranteed visible through texObj within the same kernel
// launch.  No destructor: call destroy() on the owning copy.
template<int N, typename T>
struct TexLayer2
{
size_t pitch;   // row pitch in bytes of the pitched allocation
void* data;     // device pointer to the pitched buffer
cudaTextureObject_t texObj;
// default ctor leaves members uninitialized; only valid as an
// assignment target (see TexArray2's constructors)
TexLayer2(){};
TexLayer2(int width, int height)
{
checkCudaError(cudaMallocPitch(&data, &pitch, width*N*sizeof(T), height));
const int bits = 8 * sizeof(T);
// per-channel bit counts indexed by N-1; the N==3 row is a deliberate
// poison value (-1) since 3-channel textures are unsupported
const int bitsTable[4][4] = { { bits, 0, 0, 0 },
{ bits, bits, 0, 0 },
{ -1, -1, -1, -1 },
{ bits, bits, bits, bits } };
cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = data;
resDesc.res.pitch2D.pitchInBytes = pitch;
resDesc.res.pitch2D.width = width;
resDesc.res.pitch2D.height = height;
resDesc.res.pitch2D.desc = cudaCreateChannelDesc(bitsTable[N - 1][0],
bitsTable[N - 1][1],
bitsTable[N - 1][2],
bitsTable[N - 1][3],
CudaKind<T>::kind);
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeClamp;  // out-of-range reads clamp to edge
texDesc.addressMode[1] = cudaAddressModeClamp;
texDesc.filterMode = cudaFilterModePoint;       // nearest-neighbor, no interpolation
texDesc.readMode = cudaReadModeElementType;     // return raw T, not normalized float
texDesc.normalizedCoords = 0;                   // integer pixel coordinates
texObj = 0;
checkCudaError(cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL));
}
// cached read of pixel (x,y) via the texture object (clamped at edges)
Vec<N, T> __device__ operator()(int x, int y) const
{
return cuda2jzq(tex2D<CudaVec<N, T>::type>(texObj, x, y));
}
// direct (uncached) write of pixel (x,y) into the pitched buffer;
// no bounds checking — the caller must keep (x,y) in range
void __device__ write(int x, int y, const Vec<N, T>& value)
{
Vec<N, T>* ptr = (Vec<N, T>*)&((unsigned char*)data)[x*sizeof(Vec<N, T>) + y*pitch];
*ptr = value;
}
void destroy()
{
checkCudaError( cudaDestroyTextureObject(texObj) );
checkCudaError( cudaFree(data) );
}
};
// 2-D device array of N-channel pixels stored as N_LAYERS(N,M) texture
// layers of M channels each (M defaults to N for N<3, else 4, since CUDA
// textures only support 1/2/4 channels).  tmp_data is a packed pitched
// staging buffer used by the copy() helpers below to convert between the
// host layout (one Vec<N,T> per pixel) and the layered layout.
// Shallow-copyable (passed to kernels by value); call destroy() on the
// owning copy.
template<int N, typename T, int M = N<3 ? N : 4>
struct TexArray2
{
int width;
int height;
TexLayer2<M, T> texLayers[N_LAYERS(N, M)];
size_t tmp_pitch;  // row pitch in bytes of the staging buffer
void* tmp_data;    // packed staging buffer (Vec<N,T> per pixel)
TexArray2() : width(0),height(0),tmp_pitch(0),tmp_data(0) { }
TexArray2(const V2i& size)
{
width = size(0);
height = size(1);
checkCudaError(cudaMallocPitch(&tmp_data, &tmp_pitch, width*N*sizeof(T), height));
for (int i = 0; i < N_LAYERS(N, M); ++i)
texLayers[i] = TexLayer2<M, T>(width, height);
}
TexArray2(int width, int height)
{
this->width = width;
this->height = height;
checkCudaError(cudaMallocPitch(&tmp_data, &tmp_pitch, width*N*sizeof(T), height));
for (int i = 0; i < N_LAYERS(N, M); ++i)
texLayers[i] = TexLayer2<M, T>(width, height);
}
// Reads pixel (x,y): gathers M channels from each full layer, then the
// remaining N%M channels from the final partial layer (if any).
Vec<N, T> __device__ operator()(int x, int y) const
{
Vec<N, T> ret;
Vec<M, T> tmp;
for (int i = 0; i < N / M; ++i){
tmp = texLayers[i](x, y);
for (int j = 0; j < M; ++j)
ret[i*M + j] = tmp[j];
}
if (N % M != 0){
tmp = texLayers[N / M](x, y);
for (int j = 0; j < N % M; ++j)
ret[(N / M)*M + j] = tmp[j];
}
return ret;
}
// Writes pixel (x,y), scattering the N channels across the layers in
// the same order operator() gathers them (trailing channels of a
// partial last layer are left unspecified).
void __device__ write(int x, int y, const Vec<N, T>& value)
{
Vec<M, T> tmp;
for (int i = 0; i < N / M; ++i){
for (int j = 0; j < M; ++j)
tmp[j] = value[i*M + j];
texLayers[i].write(x, y, tmp);
}
if (N % M != 0){
for (int j = 0; j < N % M; ++j)
tmp[j] = value[(N / M)*M + j];
texLayers[N / M].write(x, y, tmp);
}
}
V2i size() const
{
return V2i(width,height);
}
// Releases every layer and the staging buffer.
void destroy()
{
for (int i = 0; i < N_LAYERS(N, M); ++i)
{
texLayers[i].destroy();
}
checkCudaError( cudaFree(tmp_data) );
}
};
// Scatters the packed staging buffer (A.tmp_data, one Vec<N,T> per
// pixel row, pitched by A.tmp_pitch) into A's per-layer texture storage.
template<int N, typename T, int M>
__global__ void tmpToLayers(TexArray2<N, T, M> A)
{
  const int x = blockIdx.x*blockDim.x + threadIdx.x;
  const int y = blockIdx.y*blockDim.y + threadIdx.y;
  if (x >= A.width || y >= A.height) { return; }
  const unsigned char* rowBase = (unsigned char*)A.tmp_data + y*A.tmp_pitch;
  const Vec<N, T>* srcPixel = (const Vec<N, T>*)(rowBase + x*sizeof(Vec<N, T>));
  A.write(x, y, *srcPixel);
}
// Gathers pixels from A's per-layer texture storage into the packed
// staging buffer (A.tmp_data, one Vec<N,T> per pixel, pitched rows).
template<int N, typename T, int M>
__global__ void layersToTmp(const TexArray2<N, T, M> A)
{
  const int x = blockIdx.x*blockDim.x + threadIdx.x;
  const int y = blockIdx.y*blockDim.y + threadIdx.y;
  if (x >= A.width || y >= A.height) { return; }
  unsigned char* rowBase = (unsigned char*)A.tmp_data + y*A.tmp_pitch;
  Vec<N, T>* dstPixel = (Vec<N, T>*)(rowBase + x*sizeof(Vec<N, T>));
  *dstPixel = A(x, y);
}
// Uploads a host Array2 of N-channel pixels into a TexArray2: copies the
// packed pixels into the staging buffer, then launches tmpToLayers to
// distribute them into the per-layer texture storage.
template<int N, typename T, int M>
void copy(TexArray2<N, T, M>* out_dst,const Array2<Vec<N, T>>& src)
{
  assert(out_dst != 0);
  const TexArray2<N, T, M>& dst = *out_dst;
  assert(dst.width == src.width());
  assert(dst.height == src.height());
  const int srcWidthInBytes = src.width()*sizeof(Vec<N, T>);
  const int srcPitchInBytes = srcWidthInBytes; // host rows are tightly packed
  checkCudaError(cudaMemcpy2D(dst.tmp_data, dst.tmp_pitch, src.data(), srcPitchInBytes, srcWidthInBytes, src.height(), cudaMemcpyHostToDevice));
  const int numThreadsPerBlock = 16;
  const dim3 threadsPerBlock = dim3(numThreadsPerBlock, numThreadsPerBlock);
  const dim3 numBlocks = dim3((src.width() + threadsPerBlock.x) / threadsPerBlock.x,
                              (src.height() + threadsPerBlock.y) / threadsPerBlock.y);
  tmpToLayers << <numBlocks, threadsPerBlock >> >(dst);
  // kernel launches don't return errors directly; surface them here
  checkCudaError(cudaGetLastError());
}
// Uploads a raw host buffer (tightly packed Vec<N,T> pixels, row-major,
// dst-sized) into a TexArray2 via the staging buffer and tmpToLayers.
template<int N, typename T, int M>
void copy(TexArray2<N, T, M>* out_dst,void* src_data)
{
  assert(out_dst != 0);
  assert(src_data != 0);
  const TexArray2<N, T, M>& dst = *out_dst;
  const int srcWidthInBytes = dst.width*sizeof(Vec<N,T>);
  const int srcPitchInBytes = srcWidthInBytes; // source rows assumed tightly packed
  checkCudaError(cudaMemcpy2D(dst.tmp_data, dst.tmp_pitch, src_data, srcPitchInBytes, srcWidthInBytes, dst.height, cudaMemcpyHostToDevice));
  const int numThreadsPerBlock = 16;
  const dim3 threadsPerBlock = dim3(numThreadsPerBlock, numThreadsPerBlock);
  const dim3 numBlocks = dim3((dst.width + threadsPerBlock.x) / threadsPerBlock.x,
                              (dst.height + threadsPerBlock.y) / threadsPerBlock.y);
  tmpToLayers << <numBlocks, threadsPerBlock >> >(dst);
  // kernel launches don't return errors directly; surface them here
  checkCudaError(cudaGetLastError());
}
// Downloads a TexArray2 into a host Array2: launches layersToTmp to pack
// the layered pixels into the staging buffer, then copies it to the host
// (the blocking cudaMemcpy2D in the default stream orders after the
// kernel, so no explicit sync is needed).
template<int N, typename T, int M>
void copy(Array2<Vec<N, T>>* out_dst, const TexArray2<N, T, M>& src)
{
  assert(out_dst != 0);
  const Array2<Vec<N, T>>& dst = *out_dst;
  assert(dst.width() == src.width);
  assert(dst.height() == src.height);
  const int numThreadsPerBlock = 16;
  const dim3 threadsPerBlock = dim3(numThreadsPerBlock, numThreadsPerBlock);
  const dim3 numBlocks = dim3((dst.width() + threadsPerBlock.x) / threadsPerBlock.x,
                              (dst.height() + threadsPerBlock.y) / threadsPerBlock.y);
  layersToTmp << <numBlocks, threadsPerBlock >> >(src);
  // kernel launches don't return errors directly; surface them here
  checkCudaError(cudaGetLastError());
  const int dstPitchInBytes = dst.width()*sizeof(Vec<N, T>);
  checkCudaError(cudaMemcpy2D((void*)dst.data(), dstPitchInBytes, src.tmp_data, src.tmp_pitch, src.width*N*sizeof(T), src.height, cudaMemcpyDeviceToHost));
}
// Downloads a TexArray2 into a raw host buffer (*out_dst_data must point
// to at least width*height*sizeof(Vec<N,T>) bytes, rows tightly packed):
// packs the layers via layersToTmp, then copies the staging buffer out.
template<int N, typename T, int M>
void copy(void** out_dst_data, const TexArray2<N, T, M>& src)
{
  assert(out_dst_data != 0);
  assert(*out_dst_data != 0);
  const int numThreadsPerBlock = 16;
  const dim3 threadsPerBlock = dim3(numThreadsPerBlock, numThreadsPerBlock);
  const dim3 numBlocks = dim3((src.width + threadsPerBlock.x) / threadsPerBlock.x,
                              (src.height + threadsPerBlock.y) / threadsPerBlock.y);
  layersToTmp << <numBlocks, threadsPerBlock >> >(src);
  // kernel launches don't return errors directly; surface them here
  checkCudaError(cudaGetLastError());
  const int dstPitchInBytes = src.width*sizeof(Vec<N, T>);
  checkCudaError(cudaMemcpy2D((void*)*out_dst_data, dstPitchInBytes, src.tmp_data, src.tmp_pitch, src.width*N*sizeof(T), src.height, cudaMemcpyDeviceToHost));
}
#endif