mirror of
https://github.com/jamriska/ebsynth.git
synced 2026-04-03 09:46:39 +02:00
initial commit
This commit is contained in:
20
src/cudacheck.h
Normal file
20
src/cudacheck.h
Normal file
@@ -0,0 +1,20 @@
|
||||
#ifndef CUDACHECK_H_
#define CUDACHECK_H_

#include <cstdio>  // printf below; do not rely on a transitive include

// Reports whether a CUDA API call failed.
//
// result - status value returned by the call (cudaError_t in practice;
//          templated so any integral/enum status type works)
// func   - stringified source expression of the call (see macro below)
// file   - source file of the call site
// line   - source line of the call site
//
// Returns true (and prints a diagnostic) when result is non-zero, false on
// success. The error is reported but NOT fatal; the caller decides how to
// react.
template<typename T>
bool checkCudaError_(T result,char const* const func,const char* const file,int const line)
{
  if (result)
  {
    // %u matches the unsigned cast of the status code (the original "%d"
    // mismatched the static_cast<unsigned int> argument).
    printf("CUDA error at %s:%d code=%u \"%s\"\n",file,line,static_cast<unsigned int>(result),func);
    return true;
  }
  else
  {
    return false;
  }
}

// Wraps a CUDA call so failures are reported with the call text and location.
#define checkCudaError(val) checkCudaError_((val),#val,__FILE__,__LINE__)

#endif
|
||||
1270
src/ebsynth.cu
Normal file
1270
src/ebsynth.cu
Normal file
File diff suppressed because it is too large
Load Diff
73
src/memarray2.h
Normal file
73
src/memarray2.h
Normal file
@@ -0,0 +1,73 @@
|
||||
// This software is in the public domain. Where that dedication is not
|
||||
// recognized, you are granted a perpetual, irrevocable license to copy
|
||||
// and modify this file as you see fit.
|
||||
|
||||
#ifndef MEMARRAY2_H_
|
||||
#define MEMARRAY2_H_
|
||||
|
||||
#include "jzq.h"
|
||||
//#include "cudacheck.h"
|
||||
|
||||
// Minimal owning wrapper around a 2D array in CUDA device memory
// (row-major, element (i,j) at data[i+j*width]). No RAII: call destroy()
// explicitly; copies of this struct alias the same device allocation.
template<typename T>
struct MemArray2
{
  T* data;     // device pointer (cudaMalloc'd)
  int width;
  int height;

  // Initializers listed in declaration order (data, width, height) so the
  // list matches member initialization order (-Wreorder in the original).
  MemArray2() : data(0),width(0),height(0) {};

  MemArray2(const V2i& size)
  {
    width = size(0);
    height = size(1);
    // size_t arithmetic avoids int overflow of width*height*sizeof(T)
    // for large images.
    checkCudaError(cudaMalloc(&data,size_t(width)*size_t(height)*sizeof(T)));
  }

  MemArray2(int _width,int _height)
  {
    width = _width;
    height = _height;
    checkCudaError(cudaMalloc(&data,size_t(width)*size_t(height)*sizeof(T)));
  }

  // Frees the device allocation. NOTE(review): data is not reset to null,
  // so destroy() must be called exactly once per allocation.
  void destroy()
  {
    checkCudaError( cudaFree(data) );
  }
};
|
||||
|
||||
// Uploads a host-side Array2 into an equally-sized device-side MemArray2.
template<typename T>
void copy(MemArray2<T>* out_dst,const Array2<T>& src)
{
  assert(out_dst != 0);
  MemArray2<T>& dst = *out_dst;
  assert(dst.width == src.width());
  assert(dst.height == src.height());

  const size_t numBytes = src.width()*src.height()*sizeof(T);
  checkCudaError(cudaMemcpy(dst.data,src.data(),numBytes,cudaMemcpyHostToDevice));
}
|
||||
|
||||
// Downloads a device-side MemArray2 into an equally-sized host-side Array2.
template<typename T>
void copy(Array2<T>* out_dst,const MemArray2<T>& src)
{
  assert(out_dst != 0);
  const Array2<T>& dst = *out_dst;
  assert(dst.width() == src.width);
  assert(dst.height() == src.height);

  const size_t numBytes = src.width*src.height*sizeof(T);
  checkCudaError(cudaMemcpy((void*)dst.data(),src.data,numBytes,cudaMemcpyDeviceToHost));
}
|
||||
|
||||
#endif
|
||||
342
src/patchmatch_gpu.h
Normal file
342
src/patchmatch_gpu.h
Normal file
@@ -0,0 +1,342 @@
|
||||
// This software is in the public domain. Where that dedication is not
|
||||
// recognized, you are granted a perpetual, irrevocable license to copy
|
||||
// and modify this file as you see fit.
|
||||
|
||||
#ifndef PATCHMATCH_GPU_H_
|
||||
#define PATCHMATCH_GPU_H_
|
||||
|
||||
#include <cfloat>
|
||||
|
||||
#include <curand.h>
|
||||
#include <curand_kernel.h>
|
||||
|
||||
#include "texarray2.h"
|
||||
#include "memarray2.h"
|
||||
|
||||
typedef Vec<1,float> V1f;
|
||||
typedef Array2<Vec<1,float>> A2V1f;
|
||||
|
||||
// One thread per pixel: seeds the curand state of pixel (x,y) with a
// seed derived from its flat index. Expects a 2D launch covering at least
// width x height threads.
__global__ void krnlInitRngStates(const int width,
                                  const int height,
                                  curandState* rngStates)
{
  const int x = blockDim.x*blockIdx.x + threadIdx.x;
  const int y = blockDim.y*blockIdx.y + threadIdx.y;

  if (x>=width || y>=height) { return; }

  const int idx = x+y*width;
  curand_init((1337 << 20) + idx, 0, 0, &rngStates[idx]);
}
|
||||
|
||||
// Allocates and seeds one curandState per pixel of a width x height grid.
// The returned device buffer is owned by the caller (free with cudaFree).
curandState* initGpuRng(const int width,
                        const int height)
{
  curandState* gpuRngStates;
  // Defect fixed: this cudaMalloc was unchecked -- on failure the seeding
  // kernel would write through an invalid pointer.
  checkCudaError(cudaMalloc(&gpuRngStates,size_t(width)*size_t(height)*sizeof(curandState)));

  const dim3 threadsPerBlock(16,16);
  // Proper ceil-div; the original (w+t)/t allocated one extra block when
  // the size was an exact multiple (harmless but wasteful).
  const dim3 numBlocks((width+threadsPerBlock.x-1)/threadsPerBlock.x,
                       (height+threadsPerBlock.y-1)/threadsPerBlock.y);

  krnlInitRngStates<<<numBlocks,threadsPerBlock>>>(width,height,gpuRngStates);
  checkCudaError(cudaGetLastError());  // surface bad launch configurations

  return gpuRngStates;
}
|
||||
|
||||
// Patch-error functor: weighted sum of squared differences between the
// patchWidth x patchWidth patch of A centered at (ax,ay) and the patch of
// B centered at (bx,by).
template<int N,typename T,int M>
struct PatchSSD
{
  const TexArray2<N,T,M> A;   // source image (N channels of T)
  const TexArray2<N,T,M> B;   // target image
  const Vec<N,float> weights; // per-channel weights for the SSD

  PatchSSD(const TexArray2<N,T,M>& A,
           const TexArray2<N,T,M>& B,
           const Vec<N,float>& weights)

  : A(A),B(B),weights(weights) {}

  // Returns the weighted SSD, or an early partial sum as soon as it
  // exceeds ebest (best error so far) -- callers only compare the result
  // against ebest, so the exact value beyond that bound does not matter.
  __device__ float operator()(int patchWidth,
                              const int ax,
                              const int ay,
                              const int bx,
                              const int by,
                              const float ebest)
  {
    const int hpw = patchWidth/2;
    float ssd = 0;

    for(int py=-hpw;py<=+hpw;py++)
    {
      for(int px=-hpw;px<=+hpw;px++)
      {
        const Vec<N,T> pixelA = A(ax + px, ay + py);
        const Vec<N,T> pixelB = B(bx + px, by + py);
        for(int i=0;i<N;i++)
        {
          const float diff = float(pixelA[i])-float(pixelB[i]);
          ssd += weights[i]*diff*diff;
        }
      }

      // early-out check once per completed patch row
      if (ssd>ebest) { return ssd; }
    }

    return ssd;
  }
};
|
||||
|
||||
// Recomputes the full (no early-out) patch error of every pixel's current
// NNF entry and stores it in E. One thread per pixel of NNF.
template<typename FUNC>
__global__ void krnlEvalErrorPass(const int patchWidth,
                                  FUNC patchError,
                                  const TexArray2<2,int> NNF,
                                  TexArray2<1,float> E)
{
  const int x = blockDim.x*blockIdx.x + threadIdx.x;
  const int y = blockDim.y*blockIdx.y + threadIdx.y;

  if (x>=NNF.width || y>=NNF.height) { return; }

  const V2i n = NNF(x,y);
  const float e = patchError(patchWidth,x,y,n[0],n[1],FLT_MAX);
  E.write(x,y,V1f(e));
}
|
||||
|
||||
// Adds incdec (+1 or -1) to the occurrence counter of every target pixel
// covered by the patchWidth x patchWidth patch centered at (bx,by).
// atomicAdd is used because many threads may update the same counter
// concurrently.
// NOTE(review): no bounds check -- assumes the patch lies fully inside
// Omega; callers keep (bx,by) at least patchWidth/2 away from the border.
void __device__ updateOmega(MemArray2<int>& Omega,const int patchWidth,const int bx,const int by,const int incdec)
{
  const int r = patchWidth/2;

  for(int oy=-r;oy<=+r;oy++)
  for(int ox=-r;ox<=+r;ox++)
  {
    const int x = bx+ox;
    const int y = by+oy;
    atomicAdd(&Omega.data[x+y*Omega.width],incdec);
    //Omega.data[x+y*Omega.width] += incdec;
  }
}
|
||||
|
||||
// Sums the occurrence counters of all target pixels covered by the
// patchWidth x patchWidth patch centered at (bx,by). Plain (non-atomic)
// reads, so concurrent updateOmega calls may be observed mid-update --
// same trade-off as the original ("atomic read instead??").
int __device__ patchOmega(const int patchWidth,const int bx,const int by,const MemArray2<int>& Omega)
{
  const int r = patchWidth/2;

  int sum = 0;

  for(int dy=-r;dy<=+r;dy++)
  {
    const int rowBase = (by+dy)*Omega.width;
    for(int dx=-r;dx<=+r;dx++)
    {
      sum += Omega.data[(bx+dx)+rowBase];
    }
  }

  return sum;
}
|
||||
|
||||
// Evaluates candidate target position (bx,by) for source pixel (ax,ay)
// and, if it improves the occurrence-regularized error, commits it: the
// Omega counters are moved from the old target patch to the new one and
// nbest/ebest are updated in place.
template<typename FUNC>
__device__ void tryPatch(const V2i& sizeA,
                         const V2i& sizeB,
                         MemArray2<int>& Omega,       // per-pixel target occurrence counters
                         const int patchWidth,
                         FUNC patchError,
                         const float lambda,          // weight of the occurrence (uniformity) term
                         const int ax,
                         const int ay,
                         const int bx,
                         const int by,
                         V2i& nbest,                  // in/out: current best target position
                         float& ebest)                // in/out: current best patch error
{
  // Ideal per-patch occurrence if source pixels mapped uniformly onto B.
  const float omegaBest = (float(sizeA(0)*sizeA(1)) /
                           float(sizeB(0)*sizeB(1))) * float(patchWidth*patchWidth);

  const float curOcc = (float(patchOmega(patchWidth,nbest(0),nbest(1),Omega))/float(patchWidth*patchWidth))/omegaBest;
  const float newOcc = (float(patchOmega(patchWidth, bx, by,Omega))/float(patchWidth*patchWidth))/omegaBest;

  const float curErr = ebest;
  // curErr+lambda*curOcc is the early-out bound: patchError may bail out
  // as soon as the candidate can no longer win the comparison below.
  const float newErr = patchError(patchWidth,ax,ay,bx,by,curErr+lambda*curOcc);

  if ((newErr+lambda*newOcc) < (curErr+lambda*curOcc))
  {
    // Order matters: the decrement must use the OLD nbest before it is
    // overwritten two lines below.
    updateOmega(Omega,patchWidth, bx, by,+1);
    updateOmega(Omega,patchWidth,nbest(0),nbest(1),-1);
    nbest = V2i(bx,by);
    ebest = newErr;
  }
}
|
||||
|
||||
// Propagation step: take the NNF entry of the neighbor at offset (ox,oy),
// shift it back by the same offset, and try the result as a candidate for
// pixel (x,y). Candidates whose patch would stick out of B are skipped.
template<typename FUNC>
__device__ void tryNeighborsOffset(const int x,
                                   const int y,
                                   const int ox,
                                   const int oy,
                                   V2i& nbest,
                                   float& ebest,
                                   const V2i& sizeA,
                                   const V2i& sizeB,
                                   MemArray2<int>& Omega,
                                   const int patchWidth,
                                   FUNC patchError,
                                   const float lambda,
                                   const TexArray2<2,int>& NNF)
{
  const int hpw = patchWidth/2;

  const V2i neighborTarget = NNF(x+ox,y+oy);
  const int candidateX = neighborTarget(0)-ox;
  const int candidateY = neighborTarget(1)-oy;

  const bool inRange = (candidateX>=hpw && candidateX<sizeB(0)-hpw &&
                        candidateY>=hpw && candidateY<sizeB(1)-hpw);

  if (inRange)
  {
    tryPatch(sizeA,sizeB,Omega,patchWidth,patchError,lambda,x,y,candidateX,candidateY,nbest,ebest);
  }
}
|
||||
|
||||
// One propagation pass with step r: every masked pixel tries the
// offset-corrected NNF entries of its four axis-aligned neighbors at
// distance r. Reads NNF, writes into NNF2 and E (double-buffered; the
// caller swaps NNF/NNF2 after the pass).
template<typename FUNC>
__global__ void krnlPropagationPass(const V2i sizeA,
                                    const V2i sizeB,
                                    MemArray2<int> Omega,
                                    const int patchWidth,
                                    FUNC patchError,
                                    const float lambda,
                                    const int r,
                                    const TexArray2<2,int> NNF,
                                    TexArray2<2,int> NNF2,
                                    TexArray2<1,float> E,
                                    TexArray2<1,unsigned char> mask)
{
  const int x = blockDim.x*blockIdx.x + threadIdx.x;
  const int y = blockDim.y*blockIdx.y + threadIdx.y;

  if (x>=sizeA(0) || y>=sizeA(1)) { return; }

  V2i nbest = NNF(x,y);
  float ebest = E(x,y)(0);

  // Only pixels flagged 255 in the mask take part in the search; all
  // pixels still copy their current state into the output buffers.
  if (mask(x,y)[0]==255)
  {
    const int offsets[4][2] = { {-r,0}, {+r,0}, {0,-r}, {0,+r} };  // same order as original
    for(int i=0;i<4;i++)
    {
      tryNeighborsOffset(x,y,offsets[i][0],offsets[i][1],nbest,ebest,sizeA,sizeB,Omega,patchWidth,patchError,lambda,NNF);
    }
  }

  E.write(x,y,V1f(ebest));
  NNF2.write(x,y,nbest);
}
|
||||
|
||||
// Random-search step: samples one target position uniformly from the
// [norg-r, norg+r] box, clamped so the whole patch stays inside B, and
// tries it as a candidate for pixel (x,y).
template<typename FUNC>
__device__ void tryRandomOffsetInRadius(const int r,
                                        const V2i& sizeA,
                                        const V2i& sizeB,
                                        MemArray2<int>& Omega,
                                        const int patchWidth,
                                        FUNC patchError,
                                        const float lambda,
                                        const int x,
                                        const int y,
                                        const V2i& norg,   // search-box center (pixel's original NNF entry)
                                        V2i& nbest,
                                        float& ebest,
                                        curandState* rngState)
{
  const int hpw = patchWidth/2;

  const int xmin = max(norg(0)-r,hpw);
  const int xmax = min(norg(0)+r,sizeB(0)-1-hpw);
  const int ymin = max(norg(1)-r,hpw);
  const int ymax = min(norg(1)+r,sizeB(1)-1-hpw);

  // Defect fixed: when B is smaller than the patch (or norg lies far out
  // of range) the clamped interval is empty and the original computed
  // `curand(...) % 0` or a negative modulus -- undefined behavior. Skip
  // the sample instead.
  if (xmax<xmin || ymax<ymin) { return; }

  const int nx = xmin+(curand(rngState)%(xmax-xmin+1));
  const int ny = ymin+(curand(rngState)%(ymax-ymin+1));

  tryPatch(sizeA,sizeB,Omega,patchWidth,patchError,lambda,x,y,nx,ny,nbest,ebest);
}
|
||||
|
||||
// Random-search pass: each masked pixel samples candidates at
// exponentially growing radii (1,2,4,...) around its original NNF entry
// and keeps the best. Updates NNF and E in place (no double buffering).
template<typename FUNC>
__global__ void krnlRandomSearchPass(const V2i sizeA,
                                     const V2i sizeB,
                                     MemArray2<int> Omega,
                                     const int patchWidth,
                                     FUNC patchError,
                                     const float lambda,
                                     TexArray2<2,int> NNF,
                                     TexArray2<1,float> E,
                                     TexArray2<1,unsigned char> mask,
                                     curandState* rngStates)
{
  const int x = blockDim.x*blockIdx.x + threadIdx.x;
  const int y = blockDim.y*blockIdx.y + threadIdx.y;

  if (x>=sizeA(0) || y>=sizeA(1)) { return; }
  if (mask(x,y)[0]!=255) { return; }  // unmasked pixels keep their entries

  V2i nbest = NNF(x,y);
  float ebest = E(x,y)(0);

  const V2i norg = nbest;
  const int rmax = max(sizeB(0),sizeB(1))/2;  // loop-invariant bound hoisted

  for(int r=1;r<rmax;r*=2)
  {
    tryRandomOffsetInRadius(r,sizeA,sizeB,Omega,patchWidth,patchError,lambda,x,y,norg,nbest,ebest,&rngStates[x+y*NNF.width]);
  }

  E.write(x,y,V1f(ebest));
  NNF.write(x,y,nbest);
}
|
||||
|
||||
// Runs numIters iterations of occurrence-regularized PatchMatch on the
// GPU. Each iteration performs three propagation passes with shrinking
// step (4,2,1) followed by one random-search pass. NNF/NNF2 are
// double-buffered: the host-side handles are swapped after each
// propagation launch (kernel arguments are captured by value at launch,
// so swapping after the launch is safe), and the final result lands in
// NNF. E is left holding exact errors of the final NNF.
//
// Defect fixed: kernel launches were never checked -- a bad launch
// configuration failed silently. cudaGetLastError() is now checked after
// every launch; the cudaDeviceSynchronize() calls still surface
// asynchronous execution errors.
template<typename FUNC>
void patchmatchGPU(const V2i sizeA,
                   const V2i sizeB,
                   MemArray2<int>& Omega,
                   const int patchWidth,
                   FUNC patchError,
                   const float lambda,
                   const int numIters,
                   const int numThreadsPerBlock,
                   TexArray2<2,int>& NNF,
                   TexArray2<2,int>& NNF2,
                   TexArray2<1,float>& E,
                   TexArray2<1,unsigned char>& mask,
                   curandState* rngStates)
{
  const dim3 threadsPerBlock = dim3(numThreadsPerBlock,numThreadsPerBlock);
  const dim3 numBlocks = dim3((NNF.width+threadsPerBlock.x)/threadsPerBlock.x,
                              (NNF.height+threadsPerBlock.y)/threadsPerBlock.y);

  // Seed E with the full error of the initial NNF.
  krnlEvalErrorPass<<<numBlocks,threadsPerBlock>>>(patchWidth,patchError,NNF,E);
  checkCudaError(cudaGetLastError());
  checkCudaError(cudaDeviceSynchronize());

  for(int i=0;i<numIters;i++)
  {
    const int steps[3] = { 4, 2, 1 };  // coarse-to-fine propagation steps
    for(int s=0;s<3;s++)
    {
      krnlPropagationPass<<<numBlocks,threadsPerBlock>>>(sizeA,sizeB,Omega,patchWidth,patchError,lambda,steps[s],NNF,NNF2,E,mask);
      checkCudaError(cudaGetLastError());
      std::swap(NNF,NNF2);
      checkCudaError(cudaDeviceSynchronize());
    }

    krnlRandomSearchPass<<<numBlocks,threadsPerBlock>>>(sizeA,sizeB,Omega,patchWidth,patchError,lambda,NNF,E,mask,rngStates);
    checkCudaError(cudaGetLastError());
    checkCudaError(cudaDeviceSynchronize());
  }

  // Recompute exact (non-early-out) errors for the returned NNF.
  krnlEvalErrorPass<<<numBlocks,threadsPerBlock>>>(patchWidth,patchError,NNF,E);
  checkCudaError(cudaGetLastError());
  checkCudaError(cudaDeviceSynchronize());
}
|
||||
|
||||
#endif
|
||||
6755
src/stb_image.h
Normal file
6755
src/stb_image.h
Normal file
File diff suppressed because it is too large
Load Diff
1048
src/stb_image_write.h
Normal file
1048
src/stb_image_write.h
Normal file
File diff suppressed because it is too large
Load Diff
300
src/texarray2.h
Normal file
300
src/texarray2.h
Normal file
@@ -0,0 +1,300 @@
|
||||
// This software is in the public domain. Where that dedication is not
|
||||
// recognized, you are granted a perpetual, irrevocable license to copy
|
||||
// and modify this file as you see fit.
|
||||
|
||||
#ifndef TEXARRAY2_H_
|
||||
#define TEXARRAY2_H_
|
||||
|
||||
#include "jzq.h"
|
||||
#include "cudacheck.h"
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
// Maps (channel count N, scalar type T) to the CUDA built-in vector type
// used for texture fetches. Only N in {1,2,4} is specialized; other
// combinations have no ::type and fail to compile.
template<int N, typename T>
struct CudaVec { };

template<> struct CudaVec<1, unsigned char> { typedef uchar1 type; };
template<> struct CudaVec<2, unsigned char> { typedef uchar2 type; };
template<> struct CudaVec<4, unsigned char> { typedef uchar4 type; };

template<> struct CudaVec<1, int> { typedef int1 type; };
template<> struct CudaVec<2, int> { typedef int2 type; };
template<> struct CudaVec<4, int> { typedef int4 type; };

template<> struct CudaVec<1, float> { typedef float1 type; };
template<> struct CudaVec<2, float> { typedef float2 type; };
template<> struct CudaVec<4, float> { typedef float4 type; };

// Maps a scalar type to its cudaChannelFormatKind for channel descriptors.
template<typename T>
struct CudaKind { };

template<> struct CudaKind<unsigned char> { static const cudaChannelFormatKind kind = cudaChannelFormatKindUnsigned; };
template<> struct CudaKind<int> { static const cudaChannelFormatKind kind = cudaChannelFormatKindSigned; };
template<> struct CudaKind<float> { static const cudaChannelFormatKind kind = cudaChannelFormatKindFloat; };

// Converts CUDA built-in vector values to the project's Vec<N,T> type.
__device__ Vec<1, unsigned char> cuda2jzq(const uchar1& vec) { return Vec<1, unsigned char>(vec.x); }
__device__ Vec<2, unsigned char> cuda2jzq(const uchar2& vec) { return Vec<2, unsigned char>(vec.x, vec.y); }
__device__ Vec<4, unsigned char> cuda2jzq(const uchar4& vec) { return Vec<4, unsigned char>(vec.x, vec.y, vec.z, vec.w); }

__device__ Vec<1, int> cuda2jzq(const int1& vec) { return Vec<1, int>(vec.x); }
__device__ Vec<2, int> cuda2jzq(const int2& vec) { return Vec<2, int>(vec.x, vec.y); }
__device__ Vec<4, int> cuda2jzq(const int4& vec) { return Vec<4, int>(vec.x, vec.y, vec.z, vec.w); }

__device__ Vec<1, float> cuda2jzq(const float1& vec) { return Vec<1, float>(vec.x); }
__device__ Vec<2, float> cuda2jzq(const float2& vec) { return Vec<2, float>(vec.x, vec.y); }
__device__ Vec<4, float> cuda2jzq(const float4& vec) { return Vec<4, float>(vec.x, vec.y, vec.z, vec.w); }
|
||||
|
||||
// Number of texture layers (each holding up to M channels) needed for N
// channels: ceil(N/M). Fully parenthesized -- the original `1+(N-1)/M`
// expanded wrongly inside larger expressions (e.g. `2*N_LAYERS(n,m)`).
#define N_LAYERS(N,M) (1+((N)-1)/(M))
|
||||
|
||||
// One pitched 2D device buffer of N-channel texels plus a texture object
// for cached reads (clamped addressing, point filtering). Reads go through
// tex2D; writes go straight to the underlying pitched memory. No RAII:
// call destroy() explicitly.
// NOTE(review): the bitsTable row for N==3 is {-1,-1,-1,-1} -- 3-channel
// layers are deliberately unsupported (TexArray2 packs 3 channels into a
// 4-channel layer instead).
template<int N, typename T>
struct TexLayer2
{
  size_t pitch;               // row pitch in bytes from cudaMallocPitch
  void* data;                 // pitched device allocation
  cudaTextureObject_t texObj; // texture view of `data`

  // Default instance is uninitialized; only assigned-over (see TexArray2).
  TexLayer2(){};

  TexLayer2(int width, int height)
  {
    checkCudaError(cudaMallocPitch(&data, &pitch, width*N*sizeof(T), height));

    const int bits = 8 * sizeof(T);

    // Per-channel bit widths for the channel descriptor, indexed by N-1.
    const int bitsTable[4][4] = { { bits, 0, 0, 0 },
                                  { bits, bits, 0, 0 },
                                  { -1, -1, -1, -1 },
                                  { bits, bits, bits, bits } };

    cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypePitch2D;
    resDesc.res.pitch2D.devPtr = data;
    resDesc.res.pitch2D.pitchInBytes = pitch;
    resDesc.res.pitch2D.width = width;
    resDesc.res.pitch2D.height = height;
    resDesc.res.pitch2D.desc = cudaCreateChannelDesc(bitsTable[N - 1][0],
                                                     bitsTable[N - 1][1],
                                                     bitsTable[N - 1][2],
                                                     bitsTable[N - 1][3],
                                                     CudaKind<T>::kind);

    cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0] = cudaAddressModeClamp;
    texDesc.addressMode[1] = cudaAddressModeClamp;
    texDesc.filterMode = cudaFilterModePoint;
    texDesc.readMode = cudaReadModeElementType;
    texDesc.normalizedCoords = 0;

    texObj = 0;
    checkCudaError(cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL));
  }

  // Cached texture read of texel (x,y); coordinates clamp to the edge.
  Vec<N, T> __device__ operator()(int x, int y) const
  {
    // Defect fixed: `typename` is required -- CudaVec<N,T>::type is a
    // dependent type, and without it standard-conforming compilers
    // (nvcc/gcc/clang) reject the original MSVC-only spelling.
    return cuda2jzq(tex2D<typename CudaVec<N, T>::type>(texObj, x, y));
  }

  // Direct write of texel (x,y) into the pitched buffer (bypasses the
  // texture cache).
  void __device__ write(int x, int y, const Vec<N, T>& value)
  {
    Vec<N, T>* ptr = (Vec<N, T>*)&((unsigned char*)data)[x*sizeof(Vec<N, T>) + y*pitch];
    *ptr = value;
  }

  void destroy()
  {
    checkCudaError( cudaDestroyTextureObject(texObj) );
    checkCudaError( cudaFree(data) );
  }
};
|
||||
|
||||
// N-channel 2D device image stored as ceil(N/M) texture layers of up to M
// channels each (M defaults to N for N<3, else 4 -- 3-channel layers are
// unsupported, see TexLayer2). tmp_data/tmp_pitch is a pitched staging
// buffer of interleaved Vec<N,T> texels used by the copy() helpers below.
// No RAII: call destroy() explicitly.
template<int N, typename T, int M = N<3 ? N : 4>
struct TexArray2
{
  int width;
  int height;

  // Layer i holds channels [i*M, i*M+M).
  TexLayer2<M, T> texLayers[N_LAYERS(N, M)];

  // Staging buffer (pitched, interleaved Vec<N,T>) for host<->device copies.
  size_t tmp_pitch;
  void* tmp_data;

  TexArray2() : width(0),height(0),tmp_pitch(0),tmp_data(0) { }

  TexArray2(const V2i& size)
  {
    width = size(0);
    height = size(1);

    checkCudaError(cudaMallocPitch(&tmp_data, &tmp_pitch, width*N*sizeof(T), height));

    for (int i = 0; i < N_LAYERS(N, M); ++i)
      texLayers[i] = TexLayer2<M, T>(width, height);
  }

  TexArray2(int width, int height)
  {
    this->width = width;
    this->height = height;

    checkCudaError(cudaMallocPitch(&tmp_data, &tmp_pitch, width*N*sizeof(T), height));

    for (int i = 0; i < N_LAYERS(N, M); ++i)
      texLayers[i] = TexLayer2<M, T>(width, height);
  }

  // Gathers the N channels of texel (x,y): N/M full layers, plus a final
  // partial layer when M does not divide N.
  Vec<N, T> __device__ operator()(int x, int y) const
  {
    Vec<N, T> ret;
    Vec<M, T> tmp;

    for (int i = 0; i < N / M; ++i){
      tmp = texLayers[i](x, y);
      for (int j = 0; j < M; ++j)
        ret[i*M + j] = tmp[j];
    }

    if (N % M != 0){
      tmp = texLayers[N / M](x, y);
      for (int j = 0; j < N % M; ++j)
        ret[(N / M)*M + j] = tmp[j];
    }

    return ret;
  }

  // Scatters the N channels of `value` across the layers (inverse of the
  // read above). Trailing channels of a partial layer are unspecified.
  void __device__ write(int x, int y, const Vec<N, T>& value)
  {
    Vec<M, T> tmp;

    for (int i = 0; i < N / M; ++i){
      for (int j = 0; j < M; ++j)
        tmp[j] = value[i*M + j];
      texLayers[i].write(x, y, tmp);
    }

    if (N % M != 0){
      for (int j = 0; j < N % M; ++j)
        tmp[j] = value[(N / M)*M + j];
      texLayers[N / M].write(x, y, tmp);
    }
  }

  V2i size() const
  {
    return V2i(width,height);
  }

  // Destroys every layer and the staging buffer.
  void destroy()
  {
    for (int i = 0; i < N_LAYERS(N, M); ++i)
    {
      texLayers[i].destroy();
    }

    checkCudaError( cudaFree(tmp_data) );
  }
};
|
||||
|
||||
// Kernel: de-interleaves the staging buffer A.tmp_data (pitched Vec<N,T>
// texels) into A's per-layer texture storage. One thread per texel.
template<int N, typename T, int M>
__global__ void tmpToLayers(TexArray2<N, T, M> A)
{
  const int x = blockDim.x*blockIdx.x + threadIdx.x;
  const int y = blockDim.y*blockIdx.y + threadIdx.y;

  if (x >= A.width || y >= A.height) { return; }

  unsigned char* row = (unsigned char*)A.tmp_data + y*A.tmp_pitch;
  const Vec<N, T>* texel = (const Vec<N, T>*)(row + x*sizeof(Vec<N, T>));
  A.write(x, y, *texel);
}
|
||||
|
||||
// Kernel: interleaves A's per-layer texture storage into the staging
// buffer A.tmp_data (pitched Vec<N,T> texels). One thread per texel.
template<int N, typename T, int M>
__global__ void layersToTmp(const TexArray2<N, T, M> A)
{
  const int x = blockDim.x*blockIdx.x + threadIdx.x;
  const int y = blockDim.y*blockIdx.y + threadIdx.y;

  if (x >= A.width || y >= A.height) { return; }

  unsigned char* row = (unsigned char*)A.tmp_data + y*A.tmp_pitch;
  Vec<N, T>* texel = (Vec<N, T>*)(row + x*sizeof(Vec<N, T>));
  *texel = A(x, y);
}
|
||||
|
||||
// Uploads a host Array2 of interleaved Vec<N,T> texels into a TexArray2:
// cudaMemcpy2D into the pitched staging buffer, then a kernel
// de-interleaves it into the texture layers (default stream).
template<int N, typename T, int M>
void copy(TexArray2<N, T, M>* out_dst,const Array2<Vec<N, T>>& src)
{
  assert(out_dst != 0);
  const TexArray2<N, T, M>& dst = *out_dst;
  assert(dst.width == src.width());
  assert(dst.height == src.height());

  const int srcWidthInBytes = src.width()*sizeof(Vec<N, T>);
  const int srcPitchInBytes = srcWidthInBytes;  // host rows are tightly packed

  checkCudaError(cudaMemcpy2D(dst.tmp_data, dst.tmp_pitch, src.data(), srcPitchInBytes, srcWidthInBytes, src.height(), cudaMemcpyHostToDevice));

  const int numThreadsPerBlock = 16;
  const dim3 threadsPerBlock = dim3(numThreadsPerBlock, numThreadsPerBlock);
  const dim3 numBlocks = dim3((src.width() + threadsPerBlock.x) / threadsPerBlock.x,
                              (src.height() + threadsPerBlock.y) / threadsPerBlock.y);

  tmpToLayers<<<numBlocks, threadsPerBlock>>>(dst);
  // Defect fixed: the launch was unchecked -- bad configs failed silently.
  checkCudaError(cudaGetLastError());
}
|
||||
|
||||
// Uploads tightly-packed interleaved texels from host memory src_data
// (dst.width x dst.height Vec<N,T>) into the TexArray2 via its staging
// buffer and the de-interleaving kernel (default stream).
template<int N, typename T, int M>
void copy(TexArray2<N, T, M>* out_dst,void* src_data)
{
  assert(out_dst != 0);
  const TexArray2<N, T, M>& dst = *out_dst;

  const int srcWidthInBytes = dst.width*sizeof(Vec<N,T>);
  const int srcPitchInBytes = srcWidthInBytes;  // host rows are tightly packed

  checkCudaError(cudaMemcpy2D(dst.tmp_data, dst.tmp_pitch, src_data, srcPitchInBytes, srcWidthInBytes, dst.height, cudaMemcpyHostToDevice));

  const int numThreadsPerBlock = 16;
  const dim3 threadsPerBlock = dim3(numThreadsPerBlock, numThreadsPerBlock);
  const dim3 numBlocks = dim3((dst.width + threadsPerBlock.x) / threadsPerBlock.x,
                              (dst.height + threadsPerBlock.y) / threadsPerBlock.y);

  tmpToLayers<<<numBlocks, threadsPerBlock>>>(dst);
  // Defect fixed: the launch was unchecked -- bad configs failed silently.
  checkCudaError(cudaGetLastError());
}
|
||||
|
||||
// Downloads a TexArray2 into a host Array2: a kernel interleaves the
// layers into the staging buffer, then cudaMemcpy2D (synchronous with the
// preceding default-stream kernel) copies it to the host.
template<int N, typename T, int M>
void copy(Array2<Vec<N, T>>* out_dst, const TexArray2<N, T, M>& src)
{
  assert(out_dst != 0);
  const Array2<Vec<N, T>>& dst = *out_dst;
  assert(dst.width() == src.width);
  assert(dst.height() == src.height);

  const int numThreadsPerBlock = 16;
  const dim3 threadsPerBlock = dim3(numThreadsPerBlock, numThreadsPerBlock);
  const dim3 numBlocks = dim3((dst.width() + threadsPerBlock.x) / threadsPerBlock.x,
                              (dst.height() + threadsPerBlock.y) / threadsPerBlock.y);

  layersToTmp<<<numBlocks, threadsPerBlock>>>(src);
  // Defect fixed: the launch was unchecked -- bad configs failed silently.
  checkCudaError(cudaGetLastError());

  const int dstPitchInBytes = dst.width()*sizeof(Vec<N, T>);
  checkCudaError(cudaMemcpy2D((void*)dst.data(), dstPitchInBytes, src.tmp_data, src.tmp_pitch, src.width*N*sizeof(T), src.height, cudaMemcpyDeviceToHost));
}
|
||||
|
||||
// Downloads a TexArray2 into tightly-packed host memory *out_dst_data,
// which must hold src.width x src.height Vec<N,T> texels.
template<int N, typename T, int M>
void copy(void** out_dst_data, const TexArray2<N, T, M>& src)
{
  const int numThreadsPerBlock = 16;
  const dim3 threadsPerBlock = dim3(numThreadsPerBlock, numThreadsPerBlock);
  const dim3 numBlocks = dim3((src.width + threadsPerBlock.x) / threadsPerBlock.x,
                              (src.height + threadsPerBlock.y) / threadsPerBlock.y);

  layersToTmp<<<numBlocks, threadsPerBlock>>>(src);
  // Defect fixed: the launch was unchecked -- bad configs failed silently.
  checkCudaError(cudaGetLastError());

  const int dstPitchInBytes = src.width*sizeof(Vec<N, T>);
  checkCudaError(cudaMemcpy2D((void*)*out_dst_data, dstPitchInBytes, src.tmp_data, src.tmp_pitch, src.width*N*sizeof(T), src.height, cudaMemcpyDeviceToHost));
}
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user