add the CPU backend

2025-12-15 16:07:46 +01:00 · 2018-09-14 14:01:45 +02:00
parent 7742e68bd1
commit 93beddca07
21 changed files with 4195 additions and 2894 deletions
--- a/README.md
+++ b/README.md
@@ -28,6 +28,7 @@ ebsynth -style <style.png> -guide <source.png> <target.png> -output <output.png>
 -pyramidlevels <number>
 -searchvoteiters <number>
 -patchmatchiters <number>
+-backend [cpu|cuda]
 ```

 ## Download
@@ -129,10 +130,6 @@ equalized to match the luminance of the source painting.

 --------------------------------------------------------------------------

-## Requirements
-
-`ebsynth` needs a CUDA-capable gpu in order to run. Besides CUDA, there are no other external dependencies. A cpu-only version that doesn't require CUDA will be released later.
-
 ## License

 The code is released into the public domain. You can do anything you want with it.
--- a/build-linux-cpu+cuda.sh
+++ b/build-linux-cpu+cuda.sh
@@ -0,0 +1,2 @@
+#!/bin/sh
+nvcc -arch compute_30 src/ebsynth.cpp src/ebsynth_cpu.cpp src/ebsynth_cuda.cu -I"include" -DNDEBUG -D__CORRECT_ISO_CPP11_MATH_H_PROTO -O6 -std=c++11 -w -Xcompiler -fopenmp -o bin/ebsynth # one nvcc call builds the CLI with both backends; -Xcompiler -fopenmp passes the OpenMP flag to the host compiler
--- a/build-linux-cpu_only.sh
+++ b/build-linux-cpu_only.sh
@@ -0,0 +1,2 @@
+#!/bin/sh
+g++ src/ebsynth.cpp src/ebsynth_cpu.cpp src/ebsynth_nocuda.cpp -DNDEBUG -O6 -fopenmp -I"include" -std=c++11 -o bin/ebsynth # g++-only build: ebsynth_nocuda.cpp stands in for the CUDA backend, so nvcc is not invoked
--- a/build-linux.sh
+++ b/build-linux.sh
@@ -1,2 +0,0 @@
-#!/bin/sh
-nvcc -arch compute_30 src/ebsynth.cu -o bin/ebsynth -I "include" -std=c++11 -Xcompiler "-DNDEBUG -O6 -D__CORRECT_ISO_CPP11_MATH_H_PROTO"
--- a/build-win32-cpu+cuda.bat
+++ b/build-win32-cpu+cuda.bat
@@ -0,0 +1,14 @@
+@echo off
+setlocal ENABLEDELAYEDEXPANSION
+rem Look for Visual Studio 15, 14, 12 and 11 in that order and load the x86 compiler environment of the first one whose vcvarsall.bat call succeeds.
+for %%V in (15,14,12,11) do if exist "!VS%%V0COMNTOOLS!" call "!VS%%V0COMNTOOLS!..\..\VC\vcvarsall.bat" x86 && goto compile
+rem The first nvcc call links bin\ebsynth.exe and sends its import library to the throwaway dummy.lib. The second builds bin\ebsynth.dll with EBSYNTH_API set to dllexport and writes lib\ebsynth.lib.
+:compile
+nvcc -m32 -arch compute_30 src\ebsynth.cpp src\ebsynth_cpu.cpp src\ebsynth_cuda.cu -DNDEBUG -O6 -I "include" -o "bin\ebsynth.exe" -Xcompiler "/openmp /fp:fast" -Xlinker "/IMPLIB:dummy.lib" -w || goto error
+nvcc -m32 -arch compute_30 src\ebsynth.cpp src\ebsynth_cpu.cpp src\ebsynth_cuda.cu -DNDEBUG -O6 -I "include" -o "bin\ebsynth.dll" -Xcompiler "/openmp /fp:fast" -Xlinker "/IMPLIB:lib\ebsynth.lib" -shared -DEBSYNTH_API=__declspec(dllexport) -w || goto error
+del dummy.lib;dummy.exp 2> NUL
+goto :EOF
+rem Any failing nvcc call jumps here. The last line runs a child command interpreter that exits with code 1.
+:error
+echo FAILED
+@%COMSPEC% /C exit 1 >nul
--- a/build-win32-cpu_only.bat
+++ b/build-win32-cpu_only.bat
@@ -0,0 +1,14 @@
+@echo off
+setlocal ENABLEDELAYEDEXPANSION
+rem Look for Visual Studio 15, 14, 12 and 11 in that order and load the x86 compiler environment of the first one whose vcvarsall.bat call succeeds.
+for %%V in (15,14,12,11) do if exist "!VS%%V0COMNTOOLS!" call "!VS%%V0COMNTOOLS!..\..\VC\vcvarsall.bat" x86 && goto compile
+rem Both cl calls compile the CPU sources plus ebsynth_nocuda.cpp. The first produces bin\ebsynth.exe and the second bin\ebsynth.dll together with lib\ebsynth.lib.
+:compile
+cl src\ebsynth.cpp src\ebsynth_cpu.cpp src\ebsynth_nocuda.cpp /DNDEBUG /O2 /openmp /EHsc /nologo /I"include" /Fe"bin\ebsynth.exe" || goto error
+cl src\ebsynth.cpp src\ebsynth_cpu.cpp src\ebsynth_nocuda.cpp /DNDEBUG /O2 /openmp /EHsc /nologo /I"include" /Fe"bin\ebsynth.dll" /DEBSYNTH_API="__declspec(dllexport)" /link /IMPLIB:"lib\ebsynth.lib" || goto error
+del ebsynth.obj;ebsynth_cpu.obj;ebsynth_nocuda.obj 2> NUL
+goto :EOF
+rem Any failing cl call jumps here. The last line runs a child command interpreter that exits with code 1.
+:error
+echo FAILED
+@%COMSPEC% /C exit 1 >nul
--- a/build-win32.bat
+++ b/build-win32.bat
@@ -1,12 +0,0 @@
-@echo off
-setlocal ENABLEDELAYEDEXPANSION
-
-for %%V in (15,14,12,11) do if exist "!VS%%V0COMNTOOLS!" call "!VS%%V0COMNTOOLS!..\..\VC\vcvarsall.bat" x86 && goto compile
-
-:compile
-nvcc -arch compute_30 src\ebsynth.cu -m32 -O6 -w -I "include" -o "bin\ebsynth.exe" -Xcompiler "/DNDEBUG /Ox /Oy /Gy /Oi /fp:fast" -Xlinker "/IMPLIB:\"lib\ebsynth.lib\"" || goto error
-goto :EOF
-
-:error
-echo FAILED
-@%COMSPEC% /C exit 1 >nul
--- a/build-win64-cpu+cuda.bat
+++ b/build-win64-cpu+cuda.bat
@@ -0,0 +1,14 @@
+@echo off
+setlocal ENABLEDELAYEDEXPANSION
+rem Look for Visual Studio 15, 14, 12 and 11 in that order and load the amd64 compiler environment of the first one whose vcvarsall.bat call succeeds.
+for %%V in (15,14,12,11) do if exist "!VS%%V0COMNTOOLS!" call "!VS%%V0COMNTOOLS!..\..\VC\vcvarsall.bat" amd64 && goto compile
+rem The first nvcc call links bin\ebsynth.exe and sends its import library to the throwaway dummy.lib. The second builds bin\ebsynth.dll with EBSYNTH_API set to dllexport and writes lib\ebsynth.lib.
+:compile
+nvcc -arch compute_30 src\ebsynth.cpp src\ebsynth_cpu.cpp src\ebsynth_cuda.cu -DNDEBUG -O6 -I "include" -o "bin\ebsynth.exe" -Xcompiler "/openmp /fp:fast" -Xlinker "/IMPLIB:dummy.lib" -w || goto error
+nvcc -arch compute_30 src\ebsynth.cpp src\ebsynth_cpu.cpp src\ebsynth_cuda.cu -DNDEBUG -O6 -I "include" -o "bin\ebsynth.dll" -Xcompiler "/openmp /fp:fast" -Xlinker "/IMPLIB:lib\ebsynth.lib" -shared -DEBSYNTH_API=__declspec(dllexport) -w || goto error
+del dummy.lib;dummy.exp 2> NUL
+goto :EOF
+rem Any failing nvcc call jumps here. The last line runs a child command interpreter that exits with code 1.
+:error
+echo FAILED
+@%COMSPEC% /C exit 1 >nul
--- a/build-win64-cpu_only.bat
+++ b/build-win64-cpu_only.bat
@@ -0,0 +1,14 @@
+@echo off
+setlocal ENABLEDELAYEDEXPANSION
+rem Look for Visual Studio 15, 14, 12 and 11 in that order and load the amd64 compiler environment of the first one whose vcvarsall.bat call succeeds.
+for %%V in (15,14,12,11) do if exist "!VS%%V0COMNTOOLS!" call "!VS%%V0COMNTOOLS!..\..\VC\vcvarsall.bat" amd64 && goto compile
+rem Both cl calls compile the CPU sources plus ebsynth_nocuda.cpp. The first produces bin\ebsynth.exe and the second bin\ebsynth.dll together with lib\ebsynth.lib.
+:compile
+cl src\ebsynth.cpp src\ebsynth_cpu.cpp src\ebsynth_nocuda.cpp /DNDEBUG /O2 /openmp /EHsc /nologo /I"include" /Fe"bin\ebsynth.exe" || goto error
+cl src\ebsynth.cpp src\ebsynth_cpu.cpp src\ebsynth_nocuda.cpp /DNDEBUG /O2 /openmp /EHsc /nologo /I"include" /Fe"bin\ebsynth.dll" /DEBSYNTH_API="__declspec(dllexport)" /link /IMPLIB:"lib\ebsynth.lib" || goto error
+del ebsynth.obj;ebsynth_cpu.obj;ebsynth_nocuda.obj 2> NUL
+goto :EOF
+rem Any failing cl call jumps here. The last line runs a child command interpreter that exits with code 1.
+:error
+echo FAILED
+@%COMSPEC% /C exit 1 >nul
--- a/build-win64.bat
+++ b/build-win64.bat
@@ -1,12 +0,0 @@
-@echo off
-setlocal ENABLEDELAYEDEXPANSION
-
-for %%V in (15,14,12,11) do if exist "!VS%%V0COMNTOOLS!" call "!VS%%V0COMNTOOLS!..\..\VC\vcvarsall.bat" amd64 && goto compile
-
-:compile
-nvcc -arch compute_30 src\ebsynth.cu -m64 -O6 -w -I "include" -o "bin\ebsynth.exe" -Xcompiler "/DNDEBUG /Ox /Oy /Gy /Oi /fp:fast" -Xlinker "/IMPLIB:\"lib\ebsynth.lib\"" || goto error
-goto :EOF
-
-:error
-echo FAILED
-@%COMSPEC% /C exit 1 >nul
--- a/src/ebsynth.cpp
+++ b/src/ebsynth.cpp
@@ -0,0 +1,551 @@
+// This software is in the public domain. Where that dedication is not
+// recognized, you are granted a perpetual, irrevocable license to copy
+// and modify this file as you see fit.
+
+#include "ebsynth.h"
+#include "ebsynth_cpu.h"
+#include "ebsynth_cuda.h"
+
+#include <cstdio>
+#include <cmath>
+// Public entry point: picks a backend implementation from ebsynthBackend and forwards every other argument to it unchanged, in the same order.
+EBSYNTH_API
+void ebsynthRun(int    ebsynthBackend,
+                int    numStyleChannels,
+                int    numGuideChannels,
+                int    sourceWidth,
+                int    sourceHeight,
+                void*  sourceStyleData,
+                void*  sourceGuideData,
+                int    targetWidth,
+                int    targetHeight,
+                void*  targetGuideData,
+                void*  targetModulationData,
+                float* styleWeights,
+                float* guideWeights,
+                float  uniformityWeight,
+                int    patchSize,
+                int    voteMode,
+                int    numPyramidLevels,
+                int*   numSearchVoteItersPerLevel,
+                int*   numPatchMatchItersPerLevel,
+                int*   stopThresholdPerLevel,
+                void*  outputNnfData,
+                void*  outputImageData)
+{
+  void (*backendDispatch)(int,int,int,int,void*,void*,int,int,void*,void*,float*,float*,float,int,int,int,int*,int*,int*,void*,void*) = 0; // stays null when ebsynthBackend matches none of the cases below
+  // EBSYNTH_BACKEND_AUTO prefers CUDA whenever ebsynthBackendAvailableCuda() reports non-zero and falls back to the CPU otherwise.
+  if      (ebsynthBackend==EBSYNTH_BACKEND_CPU ) { backendDispatch = ebsynthRunCpu;  }
+  else if (ebsynthBackend==EBSYNTH_BACKEND_CUDA) { backendDispatch = ebsynthRunCuda; }
+  else if (ebsynthBackend==EBSYNTH_BACKEND_AUTO) { backendDispatch = ebsynthBackendAvailableCuda() ? ebsynthRunCuda : ebsynthRunCpu; }
+  // An unrecognized backend value makes this call a silent no-op: no backend function is invoked.
+  if (backendDispatch!=0)
+  {
+    backendDispatch(numStyleChannels,
+                    numGuideChannels,
+                    sourceWidth,
+                    sourceHeight,
+                    sourceStyleData,
+                    sourceGuideData,
+                    targetWidth,
+                    targetHeight,
+                    targetGuideData,
+                    targetModulationData,
+                    styleWeights,
+                    guideWeights,
+                    uniformityWeight,
+                    patchSize,
+                    voteMode,
+                    numPyramidLevels,
+                    numSearchVoteItersPerLevel,
+                    numPatchMatchItersPerLevel,
+                    stopThresholdPerLevel,
+                    outputNnfData,
+                    outputImageData);
+  }
+}
+// Returns non-zero when the requested backend can be used; "auto" only needs one of the two backends, unknown ids yield 0.
+EBSYNTH_API
+int ebsynthBackendAvailable(int ebsynthBackend)
+{
+  const bool wantsCpu  = (ebsynthBackend==EBSYNTH_BACKEND_CPU);
+  const bool wantsCuda = (ebsynthBackend==EBSYNTH_BACKEND_CUDA);
+  if (wantsCpu)  { return ebsynthBackendAvailableCpu(); }
+  if (wantsCuda) { return ebsynthBackendAvailableCuda(); }
+  return (ebsynthBackend==EBSYNTH_BACKEND_AUTO) ? int(ebsynthBackendAvailableCpu() || ebsynthBackendAvailableCuda()) : 0;
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <cstdio>
+#include <cmath>
+
+#include <vector>
+#include <string>
+#include <algorithm>
+
+#include "jzq.h"
+// Matches args[*inout_argi] against `name`; on a match steps past the option name and lets `handler` consume its value(s).
+template<typename FUNC>
+bool tryToParseArg(const std::vector<std::string>& args,int* inout_argi,const char* name,bool* out_fail,FUNC handler)
+{
+  // Cursor outside the argument list: flag failure and report "no match".
+  if (*inout_argi<0 || *inout_argi>=args.size())
+  {
+    *out_fail = true;
+    return false;
+  }
+
+  const bool matched = (args[*inout_argi]==name);
+  // *out_fail is always written: false on a mismatch, otherwise the negated handler result.
+  if (matched) { (*inout_argi)++; }
+  *out_fail = matched ? !handler() : false;
+
+  return matched;
+}
+
+bool tryToParseIntArg(const std::vector<std::string>& args,int* inout_argi,const char* name,int* out_value,bool* out_fail)
+{
+  // Handler run by tryToParseArg once `name` has matched; *inout_argi then indexes the value token.
+  auto parseValue = [&]
+  {
+    const int valueIndex = *inout_argi;
+    if (!(valueIndex<args.size()))
+    {
+      printf("error: missing argument for the %s option\n",name);
+      return false;
+    }
+    const std::string& token = args[valueIndex];
+    try
+    {
+      // std::stoi throws on non-numeric or out-of-range input; `consumed` exposes trailing junk such as "12abc".
+      std::size_t consumed = 0;
+      *out_value = std::stoi(token,&consumed);
+      if (consumed==token.size()) { return true; }
+    }
+    catch(...) {}
+    printf("error: bad %s argument '%s'\n",name,token.c_str());
+    return false;
+  };
+  return tryToParseArg(args,inout_argi,name,out_fail,parseValue);
+}
+
+bool tryToParseFloatArg(const std::vector<std::string>& args,int* inout_argi,const char* name,float* out_value,bool* out_fail)
+{
+  // Handler run by tryToParseArg once `name` has matched; *inout_argi then indexes the value token.
+  auto parseValue = [&]
+  {
+    const int valueIndex = *inout_argi;
+    if (!(valueIndex<args.size()))
+    {
+      printf("error: missing argument for the %s option\n",name);
+      return false;
+    }
+    const std::string& token = args[valueIndex];
+    try
+    {
+      // std::stof throws on non-numeric or out-of-range input; `consumed` exposes trailing junk such as "1.5x".
+      std::size_t consumed = 0;
+      *out_value = std::stof(token,&consumed);
+      if (consumed==token.size()) { return true; }
+    }
+    catch(...) {}
+    printf("error: bad %s argument '%s'\n",name,token.c_str());
+    return false;
+  };
+  return tryToParseArg(args,inout_argi,name,out_fail,parseValue);
+}
+
+bool tryToParseStringArg(const std::vector<std::string>& args,int* inout_argi,const char* name,std::string* out_value,bool* out_fail)
+{
+  // Handler: copies the token that follows the option name, or reports that it is missing.
+  auto takeValue = [&]
+  {
+    const bool hasValue = (*inout_argi<args.size());
+    if (hasValue) { *out_value = args[*inout_argi]; }
+    else          { printf("error: missing argument for the %s option\n",name); }
+    return hasValue;
+  };
+
+  // The cursor is left on the value token; this function does not step past it.
+  return tryToParseArg(args,inout_argi,name,out_fail,takeValue);
+}
+
+bool tryToParseStringPairArg(const std::vector<std::string>& args,int* inout_argi,const char* name,std::pair<std::string,std::string>* out_value,bool* out_fail)
+{
+  // Handler: needs two tokens after the option name; on success the cursor ends on the second one.
+  auto takePair = [&]
+  {
+    const int firstIndex = *inout_argi;
+    const bool hasBoth = ((firstIndex+1)<args.size());
+    if (!hasBoth) { printf("error: missing argument for the %s option\n",name); return false; }
+
+    *out_value = std::make_pair(args[firstIndex],args[firstIndex+1]);
+    *inout_argi = firstIndex+1;
+    return true;
+  };
+  return tryToParseArg(args,inout_argi,name,out_fail,takePair);
+}
+
+#define STB_IMAGE_IMPLEMENTATION
+#include "stb_image.h"
+
+#define STB_IMAGE_WRITE_IMPLEMENTATION
+#include "stb_image_write.h"
+// Loads an image as 4 components per pixel via stbi_load; on failure prints the reason and terminates the process, so it never returns NULL.
+unsigned char* tryLoad(const std::string& fileName,int* width,int* height)
+{
+  unsigned char* pixels = stbi_load(fileName.c_str(),width,height,NULL,4);
+  const bool loaded = (pixels!=NULL);
+  if (!loaded)
+  {
+    printf("error: failed to load '%s'\n",fileName.c_str());
+    printf("%s\n",stbi_failure_reason());
+    exit(1);
+  }
+  return pixels;
+}
+// Classifies a 4-bytes-per-pixel buffer: 1 = gray, 2 = gray+alpha, 3 = color, 4 = color+alpha.
+int evalNumChannels(const unsigned char* data,const int numPixels)
+{
+  bool anyColor = false;         // some pixel has r, g, b not all equal
+  bool anyTransparency = false;  // some pixel has alpha below 255
+
+  // Input is read as 4 bytes per pixel in r, g, b, a order.
+  for(int pixel=0;pixel<numPixels;pixel++)
+  {
+    const unsigned char* rgba = data + pixel*4;
+
+    if (rgba[0]!=rgba[1] || rgba[1]!=rgba[2]) { anyColor = true; }
+    if (rgba[3]<255)                          { anyTransparency = true; }
+  }
+
+  // 1 = gray, 3 = color; one extra channel when transparency was found.
+  int numChannels = anyColor ? 3 : 1;
+  if (anyTransparency) { numChannels += 1; }
+
+  return numChannels;
+}
+// Size of pyramid level `level`: the base size scaled by 2^-level, so level 0 is the base size itself.
+V2i pyramidLevelSize(const V2i& sizeBase,const int level)
+{
+  return V2i(V2f(sizeBase)*std::pow(2.0f,-float(level))); // rounding is whatever jzq.h's V2i(V2f) conversion does - presumably truncation, TODO confirm
+}
+
+std::string backendToString(const int ebsynthBackend)
+{
+  // Name printed for a backend id; ids are tested in the order cpu, cuda, auto and anything else maps to "unknown".
+  return (ebsynthBackend==EBSYNTH_BACKEND_CPU)  ? "cpu"  :
+         (ebsynthBackend==EBSYNTH_BACKEND_CUDA) ? "cuda" :
+         (ebsynthBackend==EBSYNTH_BACKEND_AUTO) ? "auto" : "unknown";
+}
+
+int main(int argc,char** argv)
+{
+  if (argc<2)
+  {
+    printf("usage: %s [options]\n",argv[0]);
+    printf("\n");
+    printf("options:\n");
+    printf("  -style <style.png>\n");
+    printf("  -guide <source.png> <target.png>\n");
+    printf("  -output <output.png>\n");
+    printf("  -weight <value>\n");
+    printf("  -uniformity <value>\n");
+    printf("  -patchsize <size>\n");
+    printf("  -pyramidlevels <number>\n");
+    printf("  -searchvoteiters <number>\n");
+    printf("  -patchmatchiters <number>\n");
+    printf("  -stopthreshold <value>\n");
+    printf("  -backend [cpu|cuda]\n");
+    printf("\n");
+    return 1;
+  }
+
+  std::string styleFileName;
+  float       styleWeight = NAN;
+  std::string outputFileName = "output.png";
+
+  struct Guide
+  {
+    std::string    sourceFileName;
+    std::string    targetFileName;
+    float          weight;
+
+    int            sourceWidth;
+    int            sourceHeight;
+    unsigned char* sourceData;
+
+    int            targetWidth;
+    int            targetHeight;
+    unsigned char* targetData;
+    
+    int            numChannels;
+  };
+
+  std::vector<Guide> guides;
+
+  float uniformityWeight = 3500;
+  int patchSize = 5; 
+  int numPyramidLevels = -1;
+  int numSearchVoteIters = 6;
+  int numPatchMatchIters = 4;
+  int stopThreshold = 5;
+  int backend = ebsynthBackendAvailable(EBSYNTH_BACKEND_CUDA) ? EBSYNTH_BACKEND_CUDA : EBSYNTH_BACKEND_CPU;
+
+  {
+    std::vector<std::string> args(argc);
+    for(int i=0;i<argc;i++) { args[i] = argv[i]; }
+  
+    bool fail = false;
+    int argi = 1;   
+
+    float* precedingStyleOrGuideWeight = 0;
+    while(argi<argc && !fail)
+    {
+      float weight;
+      std::pair<std::string,std::string> guidePair;
+      std::string backendName;
+
+      if      (tryToParseStringArg(args,&argi,"-style",&styleFileName,&fail))
+      {
+        styleWeight = NAN;
+        precedingStyleOrGuideWeight = &styleWeight;
+        argi++;
+      }
+      else if (tryToParseStringPairArg(args,&argi,"-guide",&guidePair,&fail))
+      {
+        Guide guide;
+        guide.sourceFileName = guidePair.first;
+        guide.targetFileName = guidePair.second;
+        guide.weight = NAN;
+        guides.push_back(guide);
+        precedingStyleOrGuideWeight = &guides[guides.size()-1].weight;
+        argi++;
+      }
+      else if (tryToParseStringArg(args,&argi,"-output",&outputFileName,&fail))
+      {
+        argi++;
+      }
+      else if (tryToParseFloatArg(args,&argi,"-weight",&weight,&fail))
+      {
+        if (precedingStyleOrGuideWeight!=0) { *precedingStyleOrGuideWeight = weight; }
+        else { printf("error: at least one -style or -guide option must precede the -weight option!\n"); return 1; }
+        argi++;
+      }
+      else if (tryToParseFloatArg(args,&argi,"-uniformity",&uniformityWeight,&fail)) { argi++; }
+      else if (tryToParseIntArg(args,&argi,"-patchsize",&patchSize,&fail))
+      {
+        if (patchSize<3)    { printf("error: patchsize is too small!\n"); return 1; }
+        if (patchSize%2==0) { printf("error: patchsize must be an odd number!\n"); return 1; }
+        argi++;
+      }
+      else if (tryToParseIntArg(args,&argi,"-pyramidlevels",&numPyramidLevels,&fail))
+      {
+        if (numPyramidLevels<1) { printf("error: bad argument for -pyramidlevels!\n"); return 1; }
+        argi++;
+      }
+      else if (tryToParseIntArg(args,&argi,"-searchvoteiters",&numSearchVoteIters,&fail))
+      {
+        if (numSearchVoteIters<0) { printf("error: bad argument for -searchvoteiters!\n"); return 1; }
+        argi++;
+      }
+      else if (tryToParseIntArg(args,&argi,"-patchmatchiters",&numPatchMatchIters,&fail))
+      {
+        if (numPatchMatchIters<0) { printf("error: bad argument for -patchmatchiters!\n"); return 1; }
+        argi++;
+      }
+      else if (tryToParseIntArg(args,&argi,"-stopthreshold",&stopThreshold,&fail))
+      {
+        if (stopThreshold<0) { printf("error: bad argument for -stopthreshold!\n"); return 1; }
+        argi++;
+      }
+      else if (tryToParseStringArg(args,&argi,"-backend",&backendName,&fail))
+      {
+        if      (backendName=="cpu" ) { backend = EBSYNTH_BACKEND_CPU; }
+        else if (backendName=="cuda") { backend = EBSYNTH_BACKEND_CUDA; }
+        else { printf("error: unrecognized backend '%s'\n",backendName.c_str()); return 1; }
+
+        if (!ebsynthBackendAvailable(backend)) { printf("error: the %s backend is not available!\n",backendToString(backend).c_str()); return 1; }
+
+        argi++;
+      }
+      else
+      {
+        printf("error: unrecognized option '%s'\n",args[argi].c_str());
+        fail = true;
+      }
+
+    }
+    
+    if (fail) { return 1; }
+  }
+
+  const int numGuides = guides.size();
+
+  int sourceWidth = 0;
+  int sourceHeight = 0;
+  unsigned char* sourceStyleData = tryLoad(styleFileName,&sourceWidth,&sourceHeight);
+  const int numStyleChannelsTotal = evalNumChannels(sourceStyleData,sourceWidth*sourceHeight);
+
+  std::vector<unsigned char> sourceStyle(sourceWidth*sourceHeight*numStyleChannelsTotal);
+  for(int xy=0;xy<sourceWidth*sourceHeight;xy++)
+  {
+    if      (numStyleChannelsTotal>0)  { sourceStyle[xy*numStyleChannelsTotal+0] = sourceStyleData[xy*4+0]; }
+    if      (numStyleChannelsTotal==2) { sourceStyle[xy*numStyleChannelsTotal+1] = sourceStyleData[xy*4+3]; }           
+    else if (numStyleChannelsTotal>1)  { sourceStyle[xy*numStyleChannelsTotal+1] = sourceStyleData[xy*4+1]; }
+    if      (numStyleChannelsTotal>2)  { sourceStyle[xy*numStyleChannelsTotal+2] = sourceStyleData[xy*4+2]; }
+    if      (numStyleChannelsTotal>3)  { sourceStyle[xy*numStyleChannelsTotal+3] = sourceStyleData[xy*4+3]; }                 
+  }
+  
+  int targetWidth = 0;
+  int targetHeight = 0;
+  int numGuideChannelsTotal = 0;
+
+  for(int i=0;i<numGuides;i++)
+  {
+    Guide& guide = guides[i];
+
+    guide.sourceData = tryLoad(guide.sourceFileName,&guide.sourceWidth,&guide.sourceHeight);
+    guide.targetData = tryLoad(guide.targetFileName,&guide.targetWidth,&guide.targetHeight);
+      
+    if              (guide.sourceWidth!=sourceWidth || guide.sourceHeight!=sourceHeight)  { printf("error: source guide '%s' doesn't match the resolution of '%s'\n",guide.sourceFileName.c_str(),styleFileName.c_str()); return 1; }      
+    if      (i>0 && (guide.targetWidth!=targetWidth || guide.targetHeight!=targetHeight)) { printf("error: target guide '%s' doesn't match the resolution of '%s'\n",guide.targetFileName.c_str(),guides[0].targetFileName.c_str()); return 1; }
+    else if (i==0) { targetWidth = guide.targetWidth; targetHeight = guide.targetHeight; }
+
+    guide.numChannels = std::max(evalNumChannels(guide.sourceData,sourceWidth*sourceHeight),
+                                 evalNumChannels(guide.targetData,targetWidth*targetHeight));    
+  
+    numGuideChannelsTotal += guide.numChannels;
+  }
+  
+  if (numStyleChannelsTotal>EBSYNTH_MAX_STYLE_CHANNELS) { printf("error: too many style channels (%d), maximum number is %d\n",numStyleChannelsTotal,EBSYNTH_MAX_STYLE_CHANNELS); return 1; }
+  if (numGuideChannelsTotal>EBSYNTH_MAX_GUIDE_CHANNELS) { printf("error: too many guide channels (%d), maximum number is %d\n",numGuideChannelsTotal,EBSYNTH_MAX_GUIDE_CHANNELS); return 1; }
+
+  std::vector<unsigned char> sourceGuides(sourceWidth*sourceHeight*numGuideChannelsTotal);
+  for(int xy=0;xy<sourceWidth*sourceHeight;xy++)
+  {
+    int c = 0;
+    for(int i=0;i<numGuides;i++)
+    { 
+      const int numChannels = guides[i].numChannels;  
+
+      if      (numChannels>0)  { sourceGuides[xy*numGuideChannelsTotal+c+0] = guides[i].sourceData[xy*4+0]; }
+      if      (numChannels==2) { sourceGuides[xy*numGuideChannelsTotal+c+1] = guides[i].sourceData[xy*4+3]; }           
+      else if (numChannels>1)  { sourceGuides[xy*numGuideChannelsTotal+c+1] = guides[i].sourceData[xy*4+1]; }
+      if      (numChannels>2)  { sourceGuides[xy*numGuideChannelsTotal+c+2] = guides[i].sourceData[xy*4+2]; }
+      if      (numChannels>3)  { sourceGuides[xy*numGuideChannelsTotal+c+3] = guides[i].sourceData[xy*4+3]; }            
+      
+      c += numChannels;
+    }
+  }
+
+  std::vector<unsigned char> targetGuides(targetWidth*targetHeight*numGuideChannelsTotal);
+  for(int xy=0;xy<targetWidth*targetHeight;xy++)
+  {
+    int c = 0;
+    for(int i=0;i<numGuides;i++)
+    { 
+      const int numChannels = guides[i].numChannels;  
+
+      if      (numChannels>0)  { targetGuides[xy*numGuideChannelsTotal+c+0] = guides[i].targetData[xy*4+0]; }
+      if      (numChannels==2) { targetGuides[xy*numGuideChannelsTotal+c+1] = guides[i].targetData[xy*4+3]; }           
+      else if (numChannels>1)  { targetGuides[xy*numGuideChannelsTotal+c+1] = guides[i].targetData[xy*4+1]; }
+      if      (numChannels>2)  { targetGuides[xy*numGuideChannelsTotal+c+2] = guides[i].targetData[xy*4+2]; }
+      if      (numChannels>3)  { targetGuides[xy*numGuideChannelsTotal+c+3] = guides[i].targetData[xy*4+3]; }            
+      
+      c += numChannels;
+    }
+  }
+
+  std::vector<float> styleWeights(numStyleChannelsTotal);
+  if (isnan(styleWeight)) { styleWeight = 1.0f; }
+  for(int i=0;i<numStyleChannelsTotal;i++) { styleWeights[i] = styleWeight / float(numStyleChannelsTotal); }
+
+  for(int i=0;i<numGuides;i++) { if (isnan(guides[i].weight)) { guides[i].weight = 1.0f/float(numGuides); } }
+
+  std::vector<float> guideWeights(numGuideChannelsTotal);
+  {
+    int c = 0;
+    for(int i=0;i<numGuides;i++)
+    { 
+      const int numChannels = guides[i].numChannels;  
+      
+      for(int j=0;j<numChannels;j++)
+      {
+        guideWeights[c+j] = guides[i].weight / float(numChannels);
+      }
+
+      c += numChannels; 
+    }
+  }
+
+  int maxPyramidLevels = 0;
+  for(int level=32;level>=0;level--)
+  {
+    if (min(pyramidLevelSize(std::min(V2i(sourceWidth,sourceHeight),V2i(targetWidth,targetHeight)),level)) >= (2*patchSize+1))
+    {
+      maxPyramidLevels = level+1;
+      break;
+    }
+  }
+
+  if (numPyramidLevels==-1) { numPyramidLevels = maxPyramidLevels; }
+  numPyramidLevels = std::min(numPyramidLevels,maxPyramidLevels); 
+
+  std::vector<int> numSearchVoteItersPerLevel(numPyramidLevels);
+  std::vector<int> numPatchMatchItersPerLevel(numPyramidLevels);
+  std::vector<int> stopThresholdPerLevel(numPyramidLevels);
+  for(int i=0;i<numPyramidLevels;i++)
+  {
+    numSearchVoteItersPerLevel[i] = numSearchVoteIters;
+    numPatchMatchItersPerLevel[i] = numPatchMatchIters;
+    stopThresholdPerLevel[i] = stopThreshold;
+  }
+
+  std::vector<unsigned char> output(targetWidth*targetHeight*numStyleChannelsTotal);
+
+  printf("uniformity: %.0f\n",uniformityWeight);
+  printf("patchsize: %d\n",patchSize);
+  printf("pyramidlevels: %d\n",numPyramidLevels);
+  printf("searchvoteiters: %d\n",numSearchVoteIters);
+  printf("patchmatchiters: %d\n",numPatchMatchIters);
+  printf("stopthreshold: %d\n",stopThreshold);
+  printf("backend: %s\n",backendToString(backend).c_str());
+
+  ebsynthRun(backend,
+             numStyleChannelsTotal,
+             numGuideChannelsTotal,
+             sourceWidth,
+             sourceHeight,
+             sourceStyle.data(),
+             sourceGuides.data(),
+             targetWidth,
+             targetHeight,
+             targetGuides.data(),
+             NULL,
+             styleWeights.data(),
+             guideWeights.data(),
+             uniformityWeight,
+             patchSize,
+             EBSYNTH_VOTEMODE_PLAIN,
+             numPyramidLevels,
+             numSearchVoteItersPerLevel.data(),
+             numPatchMatchItersPerLevel.data(),
+             stopThresholdPerLevel.data(),
+             NULL,
+             output.data());
+
+  stbi_write_png(outputFileName.c_str(),targetWidth,targetHeight,numStyleChannelsTotal,output.data(),numStyleChannelsTotal*targetWidth);
+
+  printf("result was written to %s\n",outputFileName.c_str());
+
+  stbi_image_free(sourceStyleData);
+
+  for(int i=0;i<numGuides;i++)
+  {
+    stbi_image_free(guides[i].sourceData);
+    stbi_image_free(guides[i].targetData);
+  }
+  
+  return 0;
+}
--- a/src/ebsynth_cpu.cpp
+++ b/src/ebsynth_cpu.cpp
--- a/src/ebsynth_cpu.h
+++ b/src/ebsynth_cpu.h
@@ -0,0 +1,32 @@
+// This software is in the public domain. Where that dedication is not
+// recognized, you are granted a perpetual, irrevocable license to copy
+// and modify this file as you see fit.
+
+#ifndef EBSYNTH_CPU_H_
+#define EBSYNTH_CPU_H_
+// CPU backend entry point. ebsynthRun (ebsynth.cpp) forwards to it every argument it receives after the backend selector, in the same order.
+void ebsynthRunCpu(int    numStyleChannels,
+                   int    numGuideChannels,
+                   int    sourceWidth,
+                   int    sourceHeight,
+                   void*  sourceStyleData,
+                   void*  sourceGuideData,
+                   int    targetWidth,
+                   int    targetHeight,
+                   void*  targetGuideData,
+                   void*  targetModulationData,
+                   float* styleWeights,
+                   float* guideWeights,
+                   float  uniformityWeight,
+                   int    patchSize,
+                   int    voteMode,
+                   int    numPyramidLevels,
+                   int*   numSearchVoteItersPerLevel,
+                   int*   numPatchMatchItersPerLevel,
+                   int*   stopThresholdPerLevel,
+                   void*  outputNnfData,
+                   void*  outputImageData);
+// Queried by ebsynthBackendAvailable in ebsynth.cpp, which treats a non-zero result as "CPU backend usable"; the definition is not visible from this header.
+int ebsynthBackendAvailableCpu();
+
+#endif
--- a/src/ebsynth_cuda.cu
+++ b/src/ebsynth_cuda.cu
--- a/src/ebsynth_cuda.h
+++ b/src/ebsynth_cuda.h
@@ -0,0 +1,32 @@
+// This software is in the public domain. Where that dedication is not
+// recognized, you are granted a perpetual, irrevocable license to copy
+// and modify this file as you see fit.
+
+#ifndef EBSYNTH_CUDA_H_
+#define EBSYNTH_CUDA_H_
+// CUDA backend entry point. ebsynthRun (ebsynth.cpp) forwards to it every argument it receives after the backend selector, in the same order.
+void ebsynthRunCuda(int    numStyleChannels,
+                    int    numGuideChannels,
+                    int    sourceWidth,
+                    int    sourceHeight,
+                    void*  sourceStyleData,
+                    void*  sourceGuideData,
+                    int    targetWidth,
+                    int    targetHeight,
+                    void*  targetGuideData,
+                    void*  targetModulationData,
+                    float* styleWeights,
+                    float* guideWeights,
+                    float  uniformityWeight,
+                    int    patchSize,
+                    int    voteMode,
+                    int    numPyramidLevels,
+                    int*   numSearchVoteItersPerLevel,
+                    int*   numPatchMatchItersPerLevel,
+                    int*   stopThresholdPerLevel,
+                    void*  outputNnfData,
+                    void*  outputImageData);
+// Queried by ebsynthRun and ebsynthBackendAvailable, which treat a non-zero result as "CUDA backend usable"; CPU-only builds link the ebsynth_nocuda.cpp version that returns 0.
+int ebsynthBackendAvailableCuda();
+
+#endif
--- a/src/ebsynth_cuda_check.h
+++ b/src/ebsynth_cuda_check.h
@@ -1,5 +1,5 @@
-#ifndef CUDACHECK_H_
-#define CUDACHECK_H_
+#ifndef EBSYNTH_CUDA_CHECK_H_
+#define EBSYNTH_CUDA_CHECK_H_

 template<typename T>
 bool checkCudaError_(T result,char const* const func,const char* const file,int const line)
--- a/src/ebsynth_cuda_memarray2.h
+++ b/src/ebsynth_cuda_memarray2.h
@@ -2,11 +2,11 @@
 // recognized, you are granted a perpetual, irrevocable license to copy
 // and modify this file as you see fit.

-#ifndef MEMARRAY2_H_
-#define MEMARRAY2_H_
+#ifndef EBSYNTH_CUDA_MEMARRAY2_H_
+#define EBSYNTH_CUDA_MEMARRAY2_H_

 #include "jzq.h"
-//#include "cudacheck.h"
+#include "ebsynth_cuda_check.h"

 template<typename T>
 struct MemArray2
--- a/src/ebsynth_cuda_texarray2.h
+++ b/src/ebsynth_cuda_texarray2.h
@@ -2,11 +2,11 @@
 // recognized, you are granted a perpetual, irrevocable license to copy
 // and modify this file as you see fit.

-#ifndef TEXARRAY2_H_
-#define TEXARRAY2_H_
+#ifndef EBSYNTH_CUDA_TEXARRAY2_H_
+#define EBSYNTH_CUDA_TEXARRAY2_H_

 #include "jzq.h"
-#include "cudacheck.h"
+#include "ebsynth_cuda_check.h"

 #include <cuda_runtime.h>

--- a/src/ebsynth_nocuda.cpp
+++ b/src/ebsynth_nocuda.cpp
@@ -0,0 +1,33 @@
+// This software is in the public domain. Where that dedication is not
+// recognized, you are granted a perpetual, irrevocable license to copy
+// and modify this file as you see fit.
+// Link-time stand-in for the CUDA backend in CPU-only builds; the parameter list repeats the ebsynthRunCuda declaration from ebsynth_cuda.h.
+void ebsynthRunCuda(int    numStyleChannels,
+                    int    numGuideChannels,
+                    int    sourceWidth,
+                    int    sourceHeight,
+                    void*  sourceStyleData,
+                    void*  sourceGuideData,
+                    int    targetWidth,
+                    int    targetHeight,
+                    void*  targetGuideData,
+                    void*  targetModulationData,
+                    float* styleWeights,
+                    float* guideWeights,
+                    float  uniformityWeight,
+                    int    patchSize,
+                    int    voteMode,
+                    int    numPyramidLevels,
+                    int*   numSearchVoteItersPerLevel,
+                    int*   numPatchMatchItersPerLevel,
+                    int*   stopThresholdPerLevel,
+                    void*  outputNnfData,
+                    void*  outputImageData)
+{
+  // Intentionally empty: no synthesis is performed and the output buffers are left untouched.
+}
+// Always 0, i.e. "CUDA not available"; ebsynthRun's AUTO branch and ebsynthBackendAvailable check this before choosing the CUDA path.
+int ebsynthBackendAvailableCuda()
+{
+  return 0;
+}
--- a/src/jzq.h
+++ b/src/jzq.h
--- a/src/patchmatch_gpu.h
+++ b/src/patchmatch_gpu.h
@@ -1,410 +0,0 @@
-// This software is in the public domain. Where that dedication is not
-// recognized, you are granted a perpetual, irrevocable license to copy
-// and modify this file as you see fit.
-
-#ifndef PATCHMATCH_GPU_H_
-#define PATCHMATCH_GPU_H_
-
-#include <stdint.h>
-#include <cfloat>
-
-#include "texarray2.h"
-#include "memarray2.h"
-
-struct pcgState
-{
-  uint64_t state;
-  uint64_t increment;
-};
-
-__device__ void pcgAdvance(pcgState* rng)
-{
-  rng->state = rng->state * 6364136223846793005ULL + rng->increment;
-}
-
-__device__ uint32_t pcgOutput(uint64_t state)
-{
-  return (uint32_t)(((state >> 22u) ^ state) >> ((state >> 61u) + 22u));
-}
-
-__device__ uint32_t pcgRand(pcgState* rng)
-{
-  uint64_t oldstate = rng->state;
-  pcgAdvance(rng);
-  return pcgOutput(oldstate);
-}
-
-__device__ void pcgInit(pcgState* rng,uint64_t seed,uint64_t stream)
-{
-  rng->state = 0U;
-  rng->increment = (stream << 1u) | 1u;
-  pcgAdvance(rng);
-  rng->state += seed;
-  pcgAdvance(rng);
-}
-
-typedef Vec<1,float> V1f;
-typedef Array2<Vec<1,float>> A2V1f;
-
-__global__ void krnlInitRngStates(const int width,
-                                  const int height,
-                                  pcgState* rngStates)
-{
-  const int x = blockDim.x*blockIdx.x + threadIdx.x;
-  const int y = blockDim.y*blockIdx.y + threadIdx.y;
-
-  if (x<width && y<height)
-  {
-    const int idx = x+y*width;
-    pcgInit(&rngStates[idx],1337,idx);    
-  }
-}
-
-pcgState* initGpuRng(const int width,
-                     const int height)
-{
-  pcgState* gpuRngStates;
-  cudaMalloc(&gpuRngStates,width*height*sizeof(pcgState));
-
-  const dim3 threadsPerBlock(16,16);
-  const dim3 numBlocks((width+threadsPerBlock.x)/threadsPerBlock.x,
-                       (height+threadsPerBlock.y)/threadsPerBlock.y);
-
-  krnlInitRngStates<<<numBlocks,threadsPerBlock>>>(width,height,gpuRngStates);
-
-  return gpuRngStates;
-}
-
-template<int N,typename T,int M>
-struct PatchSSD
-{
-  const TexArray2<N,T,M> A;
-  const TexArray2<N,T,M> B;
-  const Vec<N,float> weights;
-
-  PatchSSD(const TexArray2<N,T,M>& A,
-           const TexArray2<N,T,M>& B,
-           const Vec<N,float>& weights)
-
-  : A(A),B(B),weights(weights) {}
-
-   __device__ float operator()(int patchWidth,
-                               const int ax,
-                               const int ay,
-                               const int bx,
-                               const int by,
-                               const float ebest)
-   {
-    const int hpw = patchWidth/2;
-    float ssd = 0;
-
-    for(int py=-hpw;py<=+hpw;py++)
-    {
-      for(int px=-hpw;px<=+hpw;px++)
-      {
-        const Vec<N,T> pixelA = A(ax + px, ay + py);
-        const Vec<N,T> pixelB = B(bx + px, by + py);
-        for(int i=0;i<N;i++)
-        {
-          const float diff = float(pixelA[i])-float(pixelB[i]);
-          ssd += weights[i]*diff*diff;
-        }
-      }
-
-      if (ssd>ebest) { return ssd; }
-    }
-
-    return ssd;
-   }
-};
-
-template<typename FUNC>
-__global__ void krnlEvalErrorPass(const int patchWidth,
-                                  FUNC patchError,
-                                  const TexArray2<2,int> NNF,
-                                  TexArray2<1,float> E)
-{
-  const int x = blockDim.x*blockIdx.x + threadIdx.x;
-  const int y = blockDim.y*blockIdx.y + threadIdx.y;
-
-  if (x<NNF.width && y<NNF.height)
-  {
-    const V2i n = NNF(x,y);
-    E.write(x,y,V1f(patchError(patchWidth,x,y,n[0],n[1],FLT_MAX)));
-  }
-}
-
-void __device__ updateOmega(MemArray2<int>& Omega,const int patchWidth,const int bx,const int by,const int incdec)
-{
-  const int r = patchWidth/2;
-
-  for(int oy=-r;oy<=+r;oy++)
-  for(int ox=-r;ox<=+r;ox++)
-  {
-    const int x = bx+ox;
-    const int y = by+oy;
-    atomicAdd(&Omega.data[x+y*Omega.width],incdec);
-    //Omega.data[x+y*Omega.width] += incdec;
-  }
-}
-
-int __device__ patchOmega(const int patchWidth,const int bx,const int by,const MemArray2<int>& Omega)
-{
-  const int r = patchWidth/2;
-
-  int sum = 0;
-
-  for(int oy=-r;oy<=+r;oy++)
-  for(int ox=-r;ox<=+r;ox++)
-  {
-    const int x = bx+ox;
-    const int y = by+oy;
-    sum += Omega.data[x+y*Omega.width]; /// XXX: atomic read instead ??
-  }
-
-  return sum;
-}
-
-template<typename FUNC>
-__device__ void tryPatch(const  V2i& sizeA,
-                         const  V2i& sizeB,
-                                MemArray2<int>& Omega,
-                         const  int patchWidth,
-                         FUNC   patchError,
-                         const  float lambda,
-                         const  int ax,
-                         const  int ay,
-                         const  int bx,
-                         const  int by,
-                         V2i&   nbest,
-                         float& ebest)
-{
-  const float omegaBest = (float(sizeA(0)*sizeA(1)) /
-                           float(sizeB(0)*sizeB(1))) * float(patchWidth*patchWidth);
-
-  const float curOcc = (float(patchOmega(patchWidth,nbest(0),nbest(1),Omega))/float(patchWidth*patchWidth))/omegaBest;
-  const float newOcc = (float(patchOmega(patchWidth,      bx,      by,Omega))/float(patchWidth*patchWidth))/omegaBest;
-
-  const float curErr = ebest;
-  const float newErr = patchError(patchWidth,ax,ay,bx,by,curErr+lambda*curOcc);
-
-  if ((newErr+lambda*newOcc) < (curErr+lambda*curOcc))
-  {
-    updateOmega(Omega,patchWidth,      bx,      by,+1);
-    updateOmega(Omega,patchWidth,nbest(0),nbest(1),-1);
-    nbest = V2i(bx,by);
-    ebest = newErr;
-  }
-}
-
-template<typename FUNC>
-__device__ void tryNeighborsOffset(const int x,
-                                   const int y,
-                                   const int ox,
-                                   const int oy,
-                                   V2i& nbest,
-                                   float& ebest,
-                                   const V2i& sizeA,
-                                   const V2i& sizeB,
-                                         MemArray2<int>& Omega,
-                                   const int patchWidth,
-                                   FUNC patchError,
-                                   const float lambda,
-                                   const TexArray2<2,int>& NNF)
-{
-  const int hpw = patchWidth/2;
-
-  const V2i on = NNF(x+ox,y+oy);
-  const int nx = on(0)-ox;
-  const int ny = on(1)-oy;
-
-  if (nx>=hpw && nx<sizeB(0)-hpw &&
-      ny>=hpw && ny<sizeB(1)-hpw)
-  {
-    tryPatch(sizeA,sizeB,Omega,patchWidth,patchError,lambda,x,y,nx,ny,nbest,ebest);
-  }
-}
-
-template<typename FUNC>
-__global__ void krnlPropagationPass(const V2i sizeA,
-                                    const V2i sizeB,
-                                          MemArray2<int> Omega,
-                                    const int patchWidth,
-                                    FUNC  patchError,
-                                    const float lambda,
-                                    const int r,
-                                    const TexArray2<2,int> NNF,
-                                    TexArray2<2,int> NNF2,
-                                    TexArray2<1,float> E,
-                                    TexArray2<1,unsigned char> mask)
-{
-  const int x = blockDim.x*blockIdx.x + threadIdx.x;
-  const int y = blockDim.y*blockIdx.y + threadIdx.y;
-
-  if (x<sizeA(0) && y<sizeA(1))
-  {
-    V2i   nbest = NNF(x,y);
-    float ebest = E(x,y)(0);
-
-    if (mask(x,y)[0]==255)
-    {
-      tryNeighborsOffset(x,y,-r,0,nbest,ebest,sizeA,sizeB,Omega,patchWidth,patchError,lambda,NNF);
-      tryNeighborsOffset(x,y,+r,0,nbest,ebest,sizeA,sizeB,Omega,patchWidth,patchError,lambda,NNF);
-      tryNeighborsOffset(x,y,0,-r,nbest,ebest,sizeA,sizeB,Omega,patchWidth,patchError,lambda,NNF);
-      tryNeighborsOffset(x,y,0,+r,nbest,ebest,sizeA,sizeB,Omega,patchWidth,patchError,lambda,NNF);
-    }
-
-    E.write(x,y,V1f(ebest));
-    NNF2.write(x,y,nbest);
-  }
-}
-
-template<typename FUNC>
-__device__ void tryRandomOffsetInRadius(const int r,
-                                        const V2i& sizeA,
-                                        const V2i& sizeB,
-                                              MemArray2<int>& Omega,
-                                        const int patchWidth,
-                                        FUNC  patchError,
-                                        const float lambda,
-                                        const int x,
-                                        const int y,
-                                        const V2i& norg,
-                                        V2i&  nbest,
-                                        float& ebest,
-                                        pcgState* rngState)
-{
-  const int hpw = patchWidth/2;
-
-  const int xmin = max(norg(0)-r,hpw);
-  const int xmax = min(norg(0)+r,sizeB(0)-1-hpw);
-  const int ymin = max(norg(1)-r,hpw);
-  const int ymax = min(norg(1)+r,sizeB(1)-1-hpw);
-
-  const int nx = xmin+(pcgRand(rngState)%(xmax-xmin+1));
-  const int ny = ymin+(pcgRand(rngState)%(ymax-ymin+1));
-
-  tryPatch(sizeA,sizeB,Omega,patchWidth,patchError,lambda,x,y,nx,ny,nbest,ebest);
-}
-
-/*
-template<typename FUNC>
-__global__ void krnlRandomSearchPass(const V2i sizeA,
-                                     const V2i sizeB,
-                                     MemArray2<int> Omega,
-                                     const int patchWidth,
-                                     FUNC  patchError,
-                                     const float lambda,
-                                     TexArray2<2,int> NNF,
-                                     TexArray2<1,float> E,
-                                     TexArray2<1,unsigned char> mask,
-                                     pcgState* rngStates)
-{
-  const int x = blockDim.x*blockIdx.x + threadIdx.x;
-  const int y = blockDim.y*blockIdx.y + threadIdx.y;
-
-  if (x<sizeA(0) && y<sizeA(1))
-  {
-    if (mask(x,y)[0]==255)
-    {
-      V2i nbest = NNF(x,y);
-      float ebest = E(x,y)(0);
-
-      const V2i norg = nbest;
-
-      for(int r=1;r<max(sizeB(0),sizeB(1))/2;r=r*2)
-      {
-        tryRandomOffsetInRadius(r,sizeA,sizeB,Omega,patchWidth,patchError,lambda,x,y,norg,nbest,ebest,&rngStates[x+y*NNF.width]);
-      }
-
-      E.write(x,y,V1f(ebest));
-      NNF.write(x,y,nbest);
-    }
-  }
-}
-*/
-
-template<typename FUNC>
-__global__ void krnlRandomSearchPass(const V2i sizeA,
-                                     const V2i sizeB,
-                                     MemArray2<int> Omega,
-                                     const int patchWidth,
-                                     FUNC  patchError,
-                                     const float lambda,
-                                     const int radius,
-                                     TexArray2<2,int> NNF,
-                                     TexArray2<1,float> E,
-                                     TexArray2<1,unsigned char> mask,
-                                     pcgState* rngStates)
-{
-  const int x = blockDim.x*blockIdx.x + threadIdx.x;
-  const int y = blockDim.y*blockIdx.y + threadIdx.y;
-
-  if (x<sizeA(0) && y<sizeA(1))
-  {
-    if (mask(x,y)[0]==255)
-    {
-      V2i nbest = NNF(x,y);
-      float ebest = E(x,y)(0);
-
-      const V2i norg = nbest;
-
-      tryRandomOffsetInRadius(radius,sizeA,sizeB,Omega,patchWidth,patchError,lambda,x,y,norg,nbest,ebest,&rngStates[x+y*NNF.width]);
-
-      E.write(x,y,V1f(ebest));
-      NNF.write(x,y,nbest);
-    }
-  }
-}
-
-template<typename FUNC>
-void patchmatchGPU(const V2i sizeA,
-                   const V2i sizeB,
-                   MemArray2<int>& Omega,
-                   const int patchWidth,
-                   FUNC patchError,
-                   const float lambda,
-                   const int numIters,
-                   const int numThreadsPerBlock,
-                   TexArray2<2,int>& NNF,
-                   TexArray2<2,int>& NNF2,
-                   TexArray2<1,float>& E,
-                   TexArray2<1,unsigned char>& mask,
-                   pcgState* rngStates)
-{
-  const dim3 threadsPerBlock = dim3(numThreadsPerBlock,numThreadsPerBlock);
-  const dim3 numBlocks = dim3((NNF.width+threadsPerBlock.x)/threadsPerBlock.x,
-                              (NNF.height+threadsPerBlock.y)/threadsPerBlock.y);
-
-  krnlEvalErrorPass<<<numBlocks,threadsPerBlock>>>(patchWidth,patchError,NNF,E);
-
-  checkCudaError(cudaDeviceSynchronize());
-
-  for(int i=0;i<numIters;i++)
-  {
-    krnlPropagationPass<<<numBlocks,threadsPerBlock>>>(sizeA,sizeB,Omega,patchWidth,patchError,lambda,4,NNF,NNF2,E,mask); std::swap(NNF,NNF2);
-
-    checkCudaError(cudaDeviceSynchronize());
-
-    krnlPropagationPass<<<numBlocks,threadsPerBlock>>>(sizeA,sizeB,Omega,patchWidth,patchError,lambda,2,NNF,NNF2,E,mask); std::swap(NNF,NNF2);
-
-    checkCudaError(cudaDeviceSynchronize());
-
-    krnlPropagationPass<<<numBlocks,threadsPerBlock>>>(sizeA,sizeB,Omega,patchWidth,patchError,lambda,1,NNF,NNF2,E,mask); std::swap(NNF,NNF2);
-
-    checkCudaError(cudaDeviceSynchronize());
-
-    for(int r=1;r<max(sizeB(0),sizeB(1))/2;r=r*2)
-    {
-      krnlRandomSearchPass<<<numBlocks,threadsPerBlock>>>(sizeA,sizeB,Omega,patchWidth,patchError,lambda,r,NNF,E,mask,rngStates);
-    }
-
-    checkCudaError(cudaDeviceSynchronize());
-  }
-
-  krnlEvalErrorPass<<<numBlocks,threadsPerBlock>>>(patchWidth,patchError,NNF,E);
-
-  checkCudaError(cudaDeviceSynchronize());
-}
-
-#endif