diff --git a/README.md b/README.md
index b2be42b..ba2c028 100644
--- a/README.md
+++ b/README.md
@@ -28,6 +28,7 @@ ebsynth -style <style.png> -guide <source.png> <target.png> -output <output.png>
 -pyramidlevels <number>
 -searchvoteiters <number>
 -patchmatchiters <number>
+-backend [cpu|cuda]
 ```
 
 ## Download
@@ -129,10 +130,6 @@ equalized to match the luminance of the source painting.
 
 --------------------------------------------------------------------------
 
-## Requirements
-
-`ebsynth` needs a CUDA-capable gpu in order to run. Besides CUDA, there are no other external dependencies. A cpu-only version that doesn't require CUDA will be released later.
-
 ## License
 
 The code is released into the public domain. You can do anything you want with it.
diff --git a/build-linux-cpu+cuda.sh b/build-linux-cpu+cuda.sh
new file mode 100755
index 0000000..8e9d2c3
--- /dev/null
+++ b/build-linux-cpu+cuda.sh
@@ -0,0 +1,2 @@
+#!/bin/sh
+nvcc -arch compute_30 src/ebsynth.cpp src/ebsynth_cpu.cpp src/ebsynth_cuda.cu -I"include" -DNDEBUG -D__CORRECT_ISO_CPP11_MATH_H_PROTO -O6 -std=c++11 -w -Xcompiler -fopenmp -o bin/ebsynth # one nvcc call builds bin/ebsynth from the front end, the CPU source (host compiler gets -fopenmp) and the CUDA source
diff --git a/build-linux-cpu_only.sh b/build-linux-cpu_only.sh
new file mode 100755
index 0000000..01d419b
--- /dev/null
+++ b/build-linux-cpu_only.sh
@@ -0,0 +1,2 @@
+#!/bin/sh
+g++ src/ebsynth.cpp src/ebsynth_cpu.cpp src/ebsynth_nocuda.cpp -DNDEBUG -O6 -fopenmp -I"include" -std=c++11 -o bin/ebsynth # g++-only build: compiles ebsynth_nocuda.cpp where the cpu+cuda script compiles ebsynth_cuda.cu, so nvcc is not used
diff --git a/build-linux.sh b/build-linux.sh
deleted file mode 100755
index 9c6185e..0000000
--- a/build-linux.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-#!/bin/sh
-nvcc -arch compute_30 src/ebsynth.cu -o bin/ebsynth -I "include" -std=c++11 -Xcompiler "-DNDEBUG -O6 -D__CORRECT_ISO_CPP11_MATH_H_PROTO"
diff --git a/build-win32-cpu+cuda.bat b/build-win32-cpu+cuda.bat
new file mode 100644
index 0000000..19a3b6a
--- /dev/null
+++ b/build-win32-cpu+cuda.bat
@@ -0,0 +1,14 @@
+@echo off
+setlocal ENABLEDELAYEDEXPANSION
+rem 32-bit CPU+CUDA build. The loop calls vcvarsall.bat x86 of the first Visual Studio found, trying versions 15, 14, 12, 11 in that order.
+for %%V in (15,14,12,11) do if exist "!VS%%V0COMNTOOLS!" call "!VS%%V0COMNTOOLS!..\..\VC\vcvarsall.bat" x86 && goto compile
+rem The exe link sends its import library to dummy.lib, which is deleted afterwards. The dll link writes lib\ebsynth.lib.
+:compile
+nvcc -m32 -arch compute_30 src\ebsynth.cpp src\ebsynth_cpu.cpp src\ebsynth_cuda.cu -DNDEBUG -O6 -I "include" -o "bin\ebsynth.exe" -Xcompiler "/openmp /fp:fast" -Xlinker "/IMPLIB:dummy.lib" -w || goto error
+nvcc -m32 -arch compute_30 src\ebsynth.cpp src\ebsynth_cpu.cpp src\ebsynth_cuda.cu -DNDEBUG -O6 -I "include" -o "bin\ebsynth.dll" -Xcompiler "/openmp /fp:fast" -Xlinker "/IMPLIB:lib\ebsynth.lib" -shared -DEBSYNTH_API=__declspec(dllexport) -w || goto error
+del dummy.lib;dummy.exp 2> NUL
+goto :EOF
+
+:error
+echo FAILED
+@%COMSPEC% /C exit 1 >nul
diff --git a/build-win32-cpu_only.bat b/build-win32-cpu_only.bat
new file mode 100644
index 0000000..1c7cd93
--- /dev/null
+++ b/build-win32-cpu_only.bat
@@ -0,0 +1,14 @@
+@echo off
+setlocal ENABLEDELAYEDEXPANSION
+rem 32-bit CPU-only build. The loop calls vcvarsall.bat x86 of the first Visual Studio found, trying versions 15, 14, 12, 11 in that order.
+for %%V in (15,14,12,11) do if exist "!VS%%V0COMNTOOLS!" call "!VS%%V0COMNTOOLS!..\..\VC\vcvarsall.bat" x86 && goto compile
+rem The second cl call needs /LD. Without it the linker emits an executable image that is merely named ebsynth.dll.
+:compile
+cl src\ebsynth.cpp src\ebsynth_cpu.cpp src\ebsynth_nocuda.cpp /DNDEBUG /O2 /openmp /EHsc /nologo /I"include" /Fe"bin\ebsynth.exe" || goto error
+cl src\ebsynth.cpp src\ebsynth_cpu.cpp src\ebsynth_nocuda.cpp /DNDEBUG /O2 /openmp /EHsc /nologo /LD /I"include" /Fe"bin\ebsynth.dll" /DEBSYNTH_API="__declspec(dllexport)" /link /IMPLIB:"lib\ebsynth.lib" || goto error
+del ebsynth.obj;ebsynth_cpu.obj;ebsynth_nocuda.obj 2> NUL
+goto :EOF
+
+:error
+echo FAILED
+@%COMSPEC% /C exit 1 >nul
diff --git a/build-win32.bat b/build-win32.bat
deleted file mode 100644
index b2f0db8..0000000
--- a/build-win32.bat
+++ /dev/null
@@ -1,12 +0,0 @@
-@echo off
-setlocal ENABLEDELAYEDEXPANSION
-
-for %%V in (15,14,12,11) do if exist "!VS%%V0COMNTOOLS!" call "!VS%%V0COMNTOOLS!..\..\VC\vcvarsall.bat" x86 && goto compile
-
-:compile
-nvcc -arch compute_30 src\ebsynth.cu -m32 -O6 -w -I "include" -o "bin\ebsynth.exe" -Xcompiler "/DNDEBUG /Ox /Oy /Gy /Oi /fp:fast" -Xlinker "/IMPLIB:\"lib\ebsynth.lib\"" || goto error
-goto :EOF
-
-:error
-echo FAILED
-@%COMSPEC% /C exit 1 >nul
diff --git a/build-win64-cpu+cuda.bat b/build-win64-cpu+cuda.bat
new file mode 100644
index 0000000..7a25549
--- /dev/null
+++ b/build-win64-cpu+cuda.bat
@@ -0,0 +1,14 @@
+@echo off
+setlocal ENABLEDELAYEDEXPANSION
+rem 64-bit CPU+CUDA build. The loop calls vcvarsall.bat amd64 of the first Visual Studio found, trying versions 15, 14, 12, 11 in that order.
+for %%V in (15,14,12,11) do if exist "!VS%%V0COMNTOOLS!" call "!VS%%V0COMNTOOLS!..\..\VC\vcvarsall.bat" amd64 && goto compile
+rem The exe link sends its import library to dummy.lib, which is deleted afterwards. The dll link writes lib\ebsynth.lib.
+:compile
+nvcc -arch compute_30 src\ebsynth.cpp src\ebsynth_cpu.cpp src\ebsynth_cuda.cu -DNDEBUG -O6 -I "include" -o "bin\ebsynth.exe" -Xcompiler "/openmp /fp:fast" -Xlinker "/IMPLIB:dummy.lib" -w || goto error
+nvcc -arch compute_30 src\ebsynth.cpp src\ebsynth_cpu.cpp src\ebsynth_cuda.cu -DNDEBUG -O6 -I "include" -o "bin\ebsynth.dll" -Xcompiler "/openmp /fp:fast" -Xlinker "/IMPLIB:lib\ebsynth.lib" -shared -DEBSYNTH_API=__declspec(dllexport) -w || goto error
+del dummy.lib;dummy.exp 2> NUL
+goto :EOF
+
+:error
+echo FAILED
+@%COMSPEC% /C exit 1 >nul
diff --git a/build-win64-cpu_only.bat b/build-win64-cpu_only.bat
new file mode 100644
index 0000000..abf2df7
--- /dev/null
+++ b/build-win64-cpu_only.bat
@@ -0,0 +1,14 @@
+@echo off
+setlocal ENABLEDELAYEDEXPANSION
+rem 64-bit CPU-only build. The loop calls vcvarsall.bat amd64 of the first Visual Studio found, trying versions 15, 14, 12, 11 in that order.
+for %%V in (15,14,12,11) do if exist "!VS%%V0COMNTOOLS!" call "!VS%%V0COMNTOOLS!..\..\VC\vcvarsall.bat" amd64 && goto compile
+rem The second cl call needs /LD. Without it the linker emits an executable image that is merely named ebsynth.dll.
+:compile
+cl src\ebsynth.cpp src\ebsynth_cpu.cpp src\ebsynth_nocuda.cpp /DNDEBUG /O2 /openmp /EHsc /nologo /I"include" /Fe"bin\ebsynth.exe" || goto error
+cl src\ebsynth.cpp src\ebsynth_cpu.cpp src\ebsynth_nocuda.cpp /DNDEBUG /O2 /openmp /EHsc /nologo /LD /I"include" /Fe"bin\ebsynth.dll" /DEBSYNTH_API="__declspec(dllexport)" /link /IMPLIB:"lib\ebsynth.lib" || goto error
+del ebsynth.obj;ebsynth_cpu.obj;ebsynth_nocuda.obj 2> NUL
+goto :EOF
+
+:error
+echo FAILED
+@%COMSPEC% /C exit 1 >nul
diff --git a/build-win64.bat b/build-win64.bat
deleted file mode 100644
index 1ab8fe8..0000000
--- a/build-win64.bat
+++ /dev/null
@@ -1,12 +0,0 @@
-@echo off
-setlocal ENABLEDELAYEDEXPANSION
-
-for %%V in (15,14,12,11) do if exist "!VS%%V0COMNTOOLS!" call "!VS%%V0COMNTOOLS!..\..\VC\vcvarsall.bat" amd64 && goto compile
-
-:compile
-nvcc -arch compute_30 src\ebsynth.cu -m64 -O6 -w -I "include" -o "bin\ebsynth.exe" -Xcompiler "/DNDEBUG /Ox /Oy /Gy /Oi /fp:fast" -Xlinker "/IMPLIB:\"lib\ebsynth.lib\"" || goto error
-goto :EOF
-
-:error
-echo FAILED
-@%COMSPEC% /C exit 1 >nul
diff --git a/src/ebsynth.cpp b/src/ebsynth.cpp
new file mode 100644
index 0000000..d6dc2af
--- /dev/null
+++ b/src/ebsynth.cpp
@@ -0,0 +1,551 @@
+// This software is in the public domain. Where that dedication is not
+// recognized, you are granted a perpetual, irrevocable license to copy
+// and modify this file as you see fit.
+
+#include "ebsynth.h"
+#include "ebsynth_cpu.h"
+#include "ebsynth_cuda.h"
+
+#include <cstdio>
+#include <cmath>
+
+EBSYNTH_API
+void ebsynthRun(int    ebsynthBackend,
+                int    numStyleChannels,
+                int    numGuideChannels,
+                int    sourceWidth,
+                int    sourceHeight,
+                void*  sourceStyleData,
+                void*  sourceGuideData,
+                int    targetWidth,
+                int    targetHeight,
+                void*  targetGuideData,
+                void*  targetModulationData,
+                float* styleWeights,
+                float* guideWeights,
+                float  uniformityWeight,
+                int    patchSize,
+                int    voteMode,
+                int    numPyramidLevels,
+                int*   numSearchVoteItersPerLevel,
+                int*   numPatchMatchItersPerLevel,
+                int*   stopThresholdPerLevel,
+                void*  outputNnfData,
+                void*  outputImageData)
+{
+  // Signature shared by ebsynthRunCpu and ebsynthRunCuda: the ebsynthRun parameters minus the backend id.
+  typedef void (*BackendRunFn)(int,int,int,int,void*,void*,int,int,void*,void*,float*,float*,float,int,int,int,int*,int*,int*,void*,void*);
+
+  BackendRunFn runBackend = 0;
+  if      (ebsynthBackend==EBSYNTH_BACKEND_CPU ) { runBackend = ebsynthRunCpu;  }
+  else if (ebsynthBackend==EBSYNTH_BACKEND_CUDA) { runBackend = ebsynthRunCuda; }
+  else if (ebsynthBackend==EBSYNTH_BACKEND_AUTO) { runBackend = ebsynthBackendAvailableCuda() ? ebsynthRunCuda : ebsynthRunCpu; }
+  if (runBackend==0) { return; } // unrecognized backend id: no backend is invoked
+
+  runBackend(numStyleChannels,
+             numGuideChannels,
+             sourceWidth,
+             sourceHeight,
+             sourceStyleData,
+             sourceGuideData,
+             targetWidth,
+             targetHeight,
+             targetGuideData,
+             targetModulationData,
+             styleWeights,
+             guideWeights,
+             uniformityWeight,
+             patchSize,
+             voteMode,
+             numPyramidLevels,
+             numSearchVoteItersPerLevel,
+             numPatchMatchItersPerLevel,
+             stopThresholdPerLevel,
+             outputNnfData,
+             outputImageData);
+}
+
+EBSYNTH_API
+int ebsynthBackendAvailable(int ebsynthBackend)
+{
+  // AUTO is satisfied by either backend; an id matching none of the three constants yields 0.
+  return (ebsynthBackend==EBSYNTH_BACKEND_CPU ) ? ebsynthBackendAvailableCpu()
+       : (ebsynthBackend==EBSYNTH_BACKEND_CUDA) ? ebsynthBackendAvailableCuda()
+       : (ebsynthBackend==EBSYNTH_BACKEND_AUTO) ? (ebsynthBackendAvailableCpu() || ebsynthBackendAvailableCuda())
+       : 0;
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <cstdio>
+#include <cmath>
+
+#include <vector>
+#include <string>
+#include <algorithm>
+
+#include "jzq.h"
+
+template<typename FUNC>
+bool tryToParseArg(const std::vector<std::string>& args,int* inout_argi,const char* name,bool* out_fail,FUNC handler)
+{
+  // Generic option matcher: returns true when args[*inout_argi] equals `name`, in which case
+  // the cursor is moved past the name and `handler` is run to consume the option's value(s).
+  // *out_fail is set when the cursor is out of range or when the handler returns false.
+  const int cursor = *inout_argi;
+  const bool cursorValid = (cursor>=0 && cursor<args.size());
+  if (!cursorValid) { *out_fail = true; return false; }
+
+  const bool nameMatches = (args[cursor]==name);
+  if (!nameMatches) { *out_fail = false; return false; }
+
+  *inout_argi = cursor+1;
+  *out_fail = !handler();
+  return true;
+}
+
+bool tryToParseIntArg(const std::vector<std::string>& args,int* inout_argi,const char* name,int* out_value,bool* out_fail)
+{
+  // Handler run once the option name matched; *inout_argi then indexes the value token.
+  auto parseValue = [&]() -> bool
+  {
+    if (*inout_argi>=args.size())
+    {
+      printf("error: missing argument for the %s option\n",name);
+      return false;
+    }
+    const std::string& token = args[*inout_argi];
+    bool parsed = false;
+    try
+    {
+      std::size_t numCharsUsed = 0;
+      *out_value = std::stoi(token,&numCharsUsed);
+      parsed = (numCharsUsed==token.size()); // reject trailing garbage such as "12abc"
+    }
+    catch(...) { parsed = false; }
+
+    if (!parsed) { printf("error: bad %s argument '%s'\n",name,token.c_str()); }
+    return parsed;
+  };
+  return tryToParseArg(args,inout_argi,name,out_fail,parseValue);
+}
+
+bool tryToParseFloatArg(const std::vector<std::string>& args,int* inout_argi,const char* name,float* out_value,bool* out_fail)
+{
+  // Handler run once the option name matched; *inout_argi then indexes the value token.
+  auto parseValue = [&]() -> bool
+  {
+    if (*inout_argi>=args.size())
+    {
+      printf("error: missing argument for the %s option\n",name);
+      return false;
+    }
+    const std::string& token = args[*inout_argi];
+    bool parsed = false;
+    try
+    {
+      std::size_t numCharsUsed = 0;
+      *out_value = std::stof(token,&numCharsUsed);
+      parsed = (numCharsUsed==token.size()); // reject trailing garbage such as "1.5x"
+    }
+    catch(...) { parsed = false; }
+
+    if (!parsed) { printf("error: bad %s argument '%s'\n",name,token.c_str()); }
+    return parsed;
+  };
+  return tryToParseArg(args,inout_argi,name,out_fail,parseValue);
+}
+
+bool tryToParseStringArg(const std::vector<std::string>& args,int* inout_argi,const char* name,std::string* out_value,bool* out_fail)
+{
+  // Handler run once the option name matched: the token at the cursor is taken verbatim
+  // as the value. The cursor is left on that token.
+  auto takeValue = [&]() -> bool
+  {
+    if (*inout_argi>=args.size()) { printf("error: missing argument for the %s option\n",name); return false; }
+
+    *out_value = args[*inout_argi];
+    return true;
+  };
+
+  return tryToParseArg(args,inout_argi,name,out_fail,takeValue);
+}
+
+bool tryToParseStringPairArg(const std::vector<std::string>& args,int* inout_argi,const char* name,std::pair<std::string,std::string>* out_value,bool* out_fail)
+{
+  // Handler run once the option name matched: the two tokens starting at the cursor
+  // become the pair. Both must be present, otherwise the option is reported as incomplete.
+  auto takePair = [&]() -> bool
+  {
+    const int first = *inout_argi;
+    if ((first+1)>=args.size()) { printf("error: missing argument for the %s option\n",name); return false; }
+
+    *out_value = std::make_pair(args[first],args[first+1]);
+    *inout_argi = first+1; // leave the cursor on the second value
+    return true;
+  };
+  return tryToParseArg(args,inout_argi,name,out_fail,takePair);
+}
+
+#define STB_IMAGE_IMPLEMENTATION
+#include "stb_image.h"
+
+#define STB_IMAGE_WRITE_IMPLEMENTATION
+#include "stb_image_write.h"
+
+unsigned char* tryLoad(const std::string& fileName,int* width,int* height)
+{
+  // Request 4 components per pixel from stb_image; the file's own channel count is not queried.
+  unsigned char* pixels = stbi_load(fileName.c_str(),width,height,NULL,4);
+  if (pixels!=NULL) { return pixels; }
+
+  // Loading failed: report the stb_image reason and terminate the program.
+  printf("error: failed to load '%s'\n",fileName.c_str());
+  printf("%s\n",stbi_failure_reason());
+  exit(1);
+}
+
+int evalNumChannels(const unsigned char* data,const int numPixels)
+{
+  // Classifies 4-bytes-per-pixel RGBA data by the channels it actually uses:
+  // 1 = gray, 2 = gray+alpha, 3 = color, 4 = color+alpha.
+  bool usesColor = false;
+  bool usesAlpha = false;
+
+  for(int i=0;i<numPixels;i++)
+  {
+    const unsigned char* px = data+i*4;
+
+    usesColor = usesColor || px[0]!=px[1] || px[1]!=px[2]; // r, g and b are not all equal
+    usesAlpha = usesAlpha || px[3]<255;                    // pixel is not fully opaque
+  }
+
+  int numChannels = usesColor ? 3 : 1;
+  if (usesAlpha) { numChannels += 1; }
+
+  return numChannels;
+}
+
+V2i pyramidLevelSize(const V2i& sizeBase,const int level)
+{
+  const float scale = std::pow(2.0f,-float(level)); return V2i(V2f(sizeBase)*scale); // sizeBase scaled by 2^-level
+}
+
+std::string backendToString(const int ebsynthBackend)
+{
+  // Human-readable backend label; an id matching none of the three constants maps to "unknown".
+  return (ebsynthBackend==EBSYNTH_BACKEND_CPU)  ? "cpu"
+       : (ebsynthBackend==EBSYNTH_BACKEND_CUDA) ? "cuda"
+       : (ebsynthBackend==EBSYNTH_BACKEND_AUTO) ? "auto" : "unknown";
+}
+
+int main(int argc,char** argv) // command-line front end: parses the options, loads the images, calls ebsynthRun and writes the result as PNG
+{
+  if (argc<2) // no options at all: print the usage text and bail out
+  {
+    printf("usage: %s [options]\n",argv[0]);
+    printf("\n");
+    printf("options:\n");
+    printf("  -style <style.png>\n");
+    printf("  -guide <source.png> <target.png>\n");
+    printf("  -output <output.png>\n");
+    printf("  -weight <value>\n");
+    printf("  -uniformity <value>\n");
+    printf("  -patchsize <size>\n");
+    printf("  -pyramidlevels <number>\n");
+    printf("  -searchvoteiters <number>\n");
+    printf("  -patchmatchiters <number>\n");
+    printf("  -stopthreshold <value>\n");
+    printf("  -backend [cpu|cuda]\n");
+    printf("\n");
+    return 1;
+  }
+
+  std::string styleFileName;
+  float       styleWeight = NAN; // NAN means "no -weight given"; replaced by a default after parsing
+  std::string outputFileName = "output.png";
+
+  struct Guide
+  {
+    std::string    sourceFileName;
+    std::string    targetFileName;
+    float          weight;         // NAN until set by -weight or defaulted to 1/numGuides
+
+    int            sourceWidth;
+    int            sourceHeight;
+    unsigned char* sourceData;     // RGBA pixels returned by tryLoad
+
+    int            targetWidth;
+    int            targetHeight;
+    unsigned char* targetData;     // RGBA pixels returned by tryLoad
+
+    int            numChannels;    // larger of the source and target channel counts
+  };
+
+  std::vector<Guide> guides;
+
+  float uniformityWeight = 3500;
+  int patchSize = 5;
+  int numPyramidLevels = -1; // -1 means "use as many levels as the image sizes allow"
+  int numSearchVoteIters = 6;
+  int numPatchMatchIters = 4;
+  int stopThreshold = 5;
+  int backend = ebsynthBackendAvailable(EBSYNTH_BACKEND_CUDA) ? EBSYNTH_BACKEND_CUDA : EBSYNTH_BACKEND_CPU;
+
+  {
+    std::vector<std::string> args(argc);
+    for(int i=0;i<argc;i++) { args[i] = argv[i]; }
+
+    bool fail = false;
+    int argi = 1; // index of the next unparsed token; every successful branch below advances it
+
+    float* precedingStyleOrGuideWeight = 0; // weight slot of the most recent -style/-guide, written by a following -weight
+    while(argi<argc && !fail)
+    {
+      float weight;
+      std::pair<std::string,std::string> guidePair;
+      std::string backendName;
+
+      if      (tryToParseStringArg(args,&argi,"-style",&styleFileName,&fail))
+      {
+        styleWeight = NAN;
+        precedingStyleOrGuideWeight = &styleWeight;
+        argi++;
+      }
+      else if (tryToParseStringPairArg(args,&argi,"-guide",&guidePair,&fail))
+      {
+        Guide guide;
+        guide.sourceFileName = guidePair.first;
+        guide.targetFileName = guidePair.second;
+        guide.weight = NAN;
+        guides.push_back(guide);
+        precedingStyleOrGuideWeight = &guides[guides.size()-1].weight; // re-taken after every push_back, so a reallocation cannot leave it dangling
+        argi++;
+      }
+      else if (tryToParseStringArg(args,&argi,"-output",&outputFileName,&fail))
+      {
+        argi++;
+      }
+      else if (tryToParseFloatArg(args,&argi,"-weight",&weight,&fail))
+      {
+        if (precedingStyleOrGuideWeight!=0) { *precedingStyleOrGuideWeight = weight; }
+        else { printf("error: at least one -style or -guide option must precede the -weight option!\n"); return 1; }
+        argi++;
+      }
+      else if (tryToParseFloatArg(args,&argi,"-uniformity",&uniformityWeight,&fail)) { argi++; }
+      else if (tryToParseIntArg(args,&argi,"-patchsize",&patchSize,&fail))
+      {
+        if (patchSize<3)    { printf("error: patchsize is too small!\n"); return 1; }
+        if (patchSize%2==0) { printf("error: patchsize must be an odd number!\n"); return 1; }
+        argi++;
+      }
+      else if (tryToParseIntArg(args,&argi,"-pyramidlevels",&numPyramidLevels,&fail))
+      {
+        if (numPyramidLevels<1) { printf("error: bad argument for -pyramidlevels!\n"); return 1; }
+        argi++;
+      }
+      else if (tryToParseIntArg(args,&argi,"-searchvoteiters",&numSearchVoteIters,&fail))
+      {
+        if (numSearchVoteIters<0) { printf("error: bad argument for -searchvoteiters!\n"); return 1; }
+        argi++;
+      }
+      else if (tryToParseIntArg(args,&argi,"-patchmatchiters",&numPatchMatchIters,&fail))
+      {
+        if (numPatchMatchIters<0) { printf("error: bad argument for -patchmatchiters!\n"); return 1; }
+        argi++;
+      }
+      else if (tryToParseIntArg(args,&argi,"-stopthreshold",&stopThreshold,&fail))
+      {
+        if (stopThreshold<0) { printf("error: bad argument for -stopthreshold!\n"); return 1; }
+        argi++;
+      }
+      else if (tryToParseStringArg(args,&argi,"-backend",&backendName,&fail))
+      {
+        if      (backendName=="cpu" ) { backend = EBSYNTH_BACKEND_CPU; }
+        else if (backendName=="cuda") { backend = EBSYNTH_BACKEND_CUDA; }
+        else { printf("error: unrecognized backend '%s'\n",backendName.c_str()); return 1; }
+
+        if (!ebsynthBackendAvailable(backend)) { printf("error: the %s backend is not available!\n",backendToString(backend).c_str()); return 1; }
+
+        argi++;
+      }
+      else
+      {
+        printf("error: unrecognized option '%s'\n",args[argi].c_str());
+        fail = true;
+      }
+
+    }
+
+    if (fail) { return 1; }
+  }
+
+  const int numGuides = guides.size();
+  if (styleFileName.empty() || numGuides==0) { printf("error: a -style image and at least one -guide pair must be specified!\n"); return 1; } // the target size comes from the first guide, so there is nothing to synthesize without one
+  int sourceWidth = 0;
+  int sourceHeight = 0;
+  unsigned char* sourceStyleData = tryLoad(styleFileName,&sourceWidth,&sourceHeight);
+  const int numStyleChannelsTotal = evalNumChannels(sourceStyleData,sourceWidth*sourceHeight);
+
+  std::vector<unsigned char> sourceStyle(sourceWidth*sourceHeight*numStyleChannelsTotal); // style pixels repacked from RGBA to the channels actually used
+  for(int xy=0;xy<sourceWidth*sourceHeight;xy++)
+  {
+    if      (numStyleChannelsTotal>0)  { sourceStyle[xy*numStyleChannelsTotal+0] = sourceStyleData[xy*4+0]; }
+    if      (numStyleChannelsTotal==2) { sourceStyle[xy*numStyleChannelsTotal+1] = sourceStyleData[xy*4+3]; } // gray+alpha: the second packed channel is alpha
+    else if (numStyleChannelsTotal>1)  { sourceStyle[xy*numStyleChannelsTotal+1] = sourceStyleData[xy*4+1]; }
+    if      (numStyleChannelsTotal>2)  { sourceStyle[xy*numStyleChannelsTotal+2] = sourceStyleData[xy*4+2]; }
+    if      (numStyleChannelsTotal>3)  { sourceStyle[xy*numStyleChannelsTotal+3] = sourceStyleData[xy*4+3]; }
+  }
+
+  int targetWidth = 0;
+  int targetHeight = 0;
+  int numGuideChannelsTotal = 0;
+
+  for(int i=0;i<numGuides;i++)
+  {
+    Guide& guide = guides[i];
+
+    guide.sourceData = tryLoad(guide.sourceFileName,&guide.sourceWidth,&guide.sourceHeight);
+    guide.targetData = tryLoad(guide.targetFileName,&guide.targetWidth,&guide.targetHeight);
+
+    if              (guide.sourceWidth!=sourceWidth || guide.sourceHeight!=sourceHeight)  { printf("error: source guide '%s' doesn't match the resolution of '%s'\n",guide.sourceFileName.c_str(),styleFileName.c_str()); return 1; }
+    if      (i>0 && (guide.targetWidth!=targetWidth || guide.targetHeight!=targetHeight)) { printf("error: target guide '%s' doesn't match the resolution of '%s'\n",guide.targetFileName.c_str(),guides[0].targetFileName.c_str()); return 1; }
+    else if (i==0) { targetWidth = guide.targetWidth; targetHeight = guide.targetHeight; }
+
+    guide.numChannels = std::max(evalNumChannels(guide.sourceData,sourceWidth*sourceHeight),
+                                 evalNumChannels(guide.targetData,targetWidth*targetHeight));
+
+    numGuideChannelsTotal += guide.numChannels;
+  }
+
+  if (numStyleChannelsTotal>EBSYNTH_MAX_STYLE_CHANNELS) { printf("error: too many style channels (%d), maximum number is %d\n",numStyleChannelsTotal,EBSYNTH_MAX_STYLE_CHANNELS); return 1; }
+  if (numGuideChannelsTotal>EBSYNTH_MAX_GUIDE_CHANNELS) { printf("error: too many guide channels (%d), maximum number is %d\n",numGuideChannelsTotal,EBSYNTH_MAX_GUIDE_CHANNELS); return 1; }
+
+  std::vector<unsigned char> sourceGuides(sourceWidth*sourceHeight*numGuideChannelsTotal); // all source guides interleaved per pixel, guide after guide
+  for(int xy=0;xy<sourceWidth*sourceHeight;xy++)
+  {
+    int c = 0;
+    for(int i=0;i<numGuides;i++)
+    {
+      const int numChannels = guides[i].numChannels;
+
+      if      (numChannels>0)  { sourceGuides[xy*numGuideChannelsTotal+c+0] = guides[i].sourceData[xy*4+0]; }
+      if      (numChannels==2) { sourceGuides[xy*numGuideChannelsTotal+c+1] = guides[i].sourceData[xy*4+3]; }
+      else if (numChannels>1)  { sourceGuides[xy*numGuideChannelsTotal+c+1] = guides[i].sourceData[xy*4+1]; }
+      if      (numChannels>2)  { sourceGuides[xy*numGuideChannelsTotal+c+2] = guides[i].sourceData[xy*4+2]; }
+      if      (numChannels>3)  { sourceGuides[xy*numGuideChannelsTotal+c+3] = guides[i].sourceData[xy*4+3]; }
+
+      c += numChannels;
+    }
+  }
+
+  std::vector<unsigned char> targetGuides(targetWidth*targetHeight*numGuideChannelsTotal); // same layout as sourceGuides, at target resolution
+  for(int xy=0;xy<targetWidth*targetHeight;xy++)
+  {
+    int c = 0;
+    for(int i=0;i<numGuides;i++)
+    {
+      const int numChannels = guides[i].numChannels;
+
+      if      (numChannels>0)  { targetGuides[xy*numGuideChannelsTotal+c+0] = guides[i].targetData[xy*4+0]; }
+      if      (numChannels==2) { targetGuides[xy*numGuideChannelsTotal+c+1] = guides[i].targetData[xy*4+3]; }
+      else if (numChannels>1)  { targetGuides[xy*numGuideChannelsTotal+c+1] = guides[i].targetData[xy*4+1]; }
+      if      (numChannels>2)  { targetGuides[xy*numGuideChannelsTotal+c+2] = guides[i].targetData[xy*4+2]; }
+      if      (numChannels>3)  { targetGuides[xy*numGuideChannelsTotal+c+3] = guides[i].targetData[xy*4+3]; }
+
+      c += numChannels;
+    }
+  }
+
+  std::vector<float> styleWeights(numStyleChannelsTotal);
+  if (std::isnan(styleWeight)) { styleWeight = 1.0f; }
+  for(int i=0;i<numStyleChannelsTotal;i++) { styleWeights[i] = styleWeight / float(numStyleChannelsTotal); }
+
+  for(int i=0;i<numGuides;i++) { if (std::isnan(guides[i].weight)) { guides[i].weight = 1.0f/float(numGuides); } }
+
+  std::vector<float> guideWeights(numGuideChannelsTotal); // each guide's weight is split evenly over its channels
+  {
+    int c = 0;
+    for(int i=0;i<numGuides;i++)
+    {
+      const int numChannels = guides[i].numChannels;
+
+      for(int j=0;j<numChannels;j++)
+      {
+        guideWeights[c+j] = guides[i].weight / float(numChannels);
+      }
+
+      c += numChannels;
+    }
+  }
+
+  int maxPyramidLevels = 0; // stays 0 when even the full-resolution images are smaller than 2*patchSize+1
+  for(int level=32;level>=0;level--)
+  {
+    if (min(pyramidLevelSize(std::min(V2i(sourceWidth,sourceHeight),V2i(targetWidth,targetHeight)),level)) >= (2*patchSize+1))
+    {
+      maxPyramidLevels = level+1;
+      break;
+    }
+  }
+
+  if (numPyramidLevels==-1) { numPyramidLevels = maxPyramidLevels; }
+  numPyramidLevels = std::min(numPyramidLevels,maxPyramidLevels);
+  if (numPyramidLevels<1) { printf("error: the images are too small for patchsize %d!\n",patchSize); return 1; }
+  std::vector<int> numSearchVoteItersPerLevel(numPyramidLevels);
+  std::vector<int> numPatchMatchItersPerLevel(numPyramidLevels);
+  std::vector<int> stopThresholdPerLevel(numPyramidLevels);
+  for(int i=0;i<numPyramidLevels;i++)
+  {
+    numSearchVoteItersPerLevel[i] = numSearchVoteIters;
+    numPatchMatchItersPerLevel[i] = numPatchMatchIters;
+    stopThresholdPerLevel[i] = stopThreshold;
+  }
+
+  std::vector<unsigned char> output(targetWidth*targetHeight*numStyleChannelsTotal);
+
+  printf("uniformity: %.0f\n",uniformityWeight);
+  printf("patchsize: %d\n",patchSize);
+  printf("pyramidlevels: %d\n",numPyramidLevels);
+  printf("searchvoteiters: %d\n",numSearchVoteIters);
+  printf("patchmatchiters: %d\n",numPatchMatchIters);
+  printf("stopthreshold: %d\n",stopThreshold);
+  printf("backend: %s\n",backendToString(backend).c_str());
+
+  ebsynthRun(backend,
+             numStyleChannelsTotal,
+             numGuideChannelsTotal,
+             sourceWidth,
+             sourceHeight,
+             sourceStyle.data(),
+             sourceGuides.data(),
+             targetWidth,
+             targetHeight,
+             targetGuides.data(),
+             NULL,
+             styleWeights.data(),
+             guideWeights.data(),
+             uniformityWeight,
+             patchSize,
+             EBSYNTH_VOTEMODE_PLAIN,
+             numPyramidLevels,
+             numSearchVoteItersPerLevel.data(),
+             numPatchMatchItersPerLevel.data(),
+             stopThresholdPerLevel.data(),
+             NULL,
+             output.data());
+
+  const int writeOk = stbi_write_png(outputFileName.c_str(),targetWidth,targetHeight,numStyleChannelsTotal,output.data(),numStyleChannelsTotal*targetWidth); // 0 signals a failed write
+  if (writeOk) { printf("result was written to %s\n",outputFileName.c_str()); }
+  else         { printf("error: failed to write '%s'\n",outputFileName.c_str()); }
+
+  stbi_image_free(sourceStyleData);
+
+  for(int i=0;i<numGuides;i++)
+  {
+    stbi_image_free(guides[i].sourceData);
+    stbi_image_free(guides[i].targetData);
+  }
+
+  return writeOk ? 0 : 1;
+}
diff --git a/src/ebsynth_cpu.cpp b/src/ebsynth_cpu.cpp
new file mode 100644
index 0000000..5fd89cc
--- /dev/null
+++ b/src/ebsynth_cpu.cpp
@@ -0,0 +1,1037 @@
+// This software is in the public domain. Where that dedication is not
+// recognized, you are granted a perpetual, irrevocable license to copy
+// and modify this file as you see fit.
+
+#include "ebsynth.h"
+#include "jzq.h"
+#include "omp.h"
+
+#include <cmath>
+#include <cfloat>
+#include <cstring>
+
+#define FOR(A,X,Y) for(int Y=0;Y<A.height();Y++) for(int X=0;X<A.width();X++) // visits every (X,Y) of A: rows in the outer loop, columns in the inner loop
+
+// Builds a field of size sizeA filled with rand()-drawn positions in sizeB; each coordinate d lies
+// in [patchWidth, sizeB(d)-patchWidth). Requires sizeB(d) > 2*patchWidth so the modulus is positive.
+A2V2i nnfInit(const V2i& sizeA,const V2i& sizeB,const int patchWidth)
+{
+  const int spanX = sizeB(0)-2*patchWidth;
+  const int spanY = sizeB(1)-2*patchWidth;
+
+  A2V2i field(sizeA);
+  const int count = field.numel();
+  // Both rand() calls are arguments of one V2i(...) call; their relative order is the compiler's choice.
+  for(int idx=0;idx<count;idx++) { field[idx] = V2i(patchWidth+rand()%spanX,patchWidth+rand()%spanY); }
+
+  return field;
+}
+
+// Evaluates patchError for every entry of NNF and returns the resulting error map (same size as NNF).
+// FLT_MAX is passed as the "best so far" bound; rows are distributed over OpenMP threads.
+template<typename FUNC>
+A2f nnfError(const A2V2i& NNF,const int patchWidth,FUNC patchError)
+{
+  const int rows = NNF.height();
+  const int cols = NNF.width();
+  A2f errors(size(NNF));
+
+  #pragma omp parallel for schedule(static)
+  for(int row=0;row<rows;row++)
+  {
+    for(int col=0;col<cols;col++) { errors(col,row) = patchError(patchWidth,V2i(col,row),NNF(col,row),FLT_MAX); }
+  }
+  return errors;
+}
+
+// Random initial nearest-neighbour field: one rand()-drawn source position per target pixel, each
+// coordinate d lying in [margin, sourceSize[d]-margin) with margin = patchSize/2.
+// Precondition: sourceSize[d] > 2*margin in both dimensions, otherwise the modulus below is zero or negative.
+static A2V2i nnfInitRandom(const V2i& targetSize,const V2i& sourceSize,const int patchSize)
+{
+  const int margin = patchSize/2;
+  const int rangeX = sourceSize[0]-2*margin;
+  const int rangeY = sourceSize[1]-2*margin;
+
+  A2V2i field(targetSize);
+  const int total = field.numel();
+
+  // The two rand() calls are arguments of a single V2i(...) call, so their relative order is the compiler's choice.
+  for (int k = 0; k < total; k++)
+    field[k] = V2i(margin+(rand()%rangeX),margin+(rand()%rangeY));
+
+  return field;
+}
+
+// Doubles the resolution of a nearest-neighbour field: entry (x,y) of the result is the coarse
+// entry at (x/2,y/2) times two plus the parity offset (x%2,y%2), after which each coordinate d is
+// clamped to [patchSize, sourceSize(d)-patchSize-1].
+static A2V2i nnfUpscale(const A2V2i& NNF,const int patchSize,const V2i& targetSize,const V2i& sourceSize)
+{
+  A2V2i fine(targetSize);
+
+  const int lastCoarseX = NNF.width()-1;
+  const int lastCoarseY = NNF.height()-1;
+  const int maxX = sourceSize(0)-patchSize-1;
+  const int maxY = sourceSize(1)-patchSize-1;
+
+  for(int y=0;y<fine.height();y++)
+  for(int x=0;x<fine.width();x++)
+  {
+    // Clamp the lookup so it stays inside the coarse field.
+    const V2i coarse = NNF(clamp(x/2,0,lastCoarseX),clamp(y/2,0,lastCoarseY));
+    const V2i nn = coarse*2+V2i(x%2,y%2);
+
+    fine(x,y) = V2i(clamp(nn(0),patchSize,maxX),clamp(nn(1),patchSize,maxY));
+  }
+  return fine;
+}
+
+// Voting step: rebuilds every target pixel as the plain (unweighted) average of the source
+// pixels proposed for it by the NNF entries of the neighbouring patches that overlap it.
+//   target    : output image, written at every (x,y)
+//   source    : image the NNF positions refer to
+//   NNF       : per-target-pixel source positions
+//   patchSize : patch width; offsets run over [-patchSize/2, +patchSize/2] in x and y
+template<int N,typename T>
+void krnlVotePlain(      Array2<Vec<N,T>>&   target,
+                   const Array2<Vec<N,T>>&   source,
+                   const Array2<Vec<2,int>>& NNF,
+                   const int                 patchSize)
+{
+  const int r = patchSize / 2;
+  const float voteWeight = 1.0f; // every vote counts equally in this mode
+
+  for(int y=0;y<target.height();y++)
+  for(int x=0;x<target.width();x++)
+  {
+    Vec<N,float> colorSum = zero<Vec<N,float>>::value();
+    float weightSum = 0;
+
+    for (int py = -r; py <= +r; py++)
+    for (int px = -r; px <= +r; px++)
+    {
+      const int qx = x+px;
+      const int qy = y+py;
+
+      // Skip neighbours that fall outside the field.
+      if (qx < 0 || qx >= NNF.width() || qy < 0 || qy >= NNF.height()) { continue; }
+
+      // The patch anchored at (qx,qy) maps this pixel to its NNF entry shifted back by the offset.
+      const V2i vote = NNF(qx,qy)-V2i(px,py);
+
+      // Ignore votes pointing outside the source image.
+      if (vote[0] < 0 || vote[0] >= source.width() || vote[1] < 0 || vote[1] >= source.height()) { continue; }
+
+      colorSum += voteWeight*Vec<N,float>(source(vote(0),vote(1)));
+      weightSum += voteWeight;
+    }
+    target(x,y) = Vec<N,T>(colorSum/weightSum);
+  }
+}
+
+#if 0 // disabled: CUDA kernel (uses __global__, blockIdx/threadIdx, TexArray2) excluded from compilation
+template<int N, typename T, int M>
+__global__ void krnlVoteWeighted(      TexArray2<N,T,M>   target,
+                                 const TexArray2<N,T,M>   source,
+                                 const TexArray2<2,int>   NNF,
+                                 const TexArray2<1,float> E,
+                                 const int patchSize)
+{
+  const int x = blockDim.x*blockIdx.x + threadIdx.x;
+  const int y = blockDim.y*blockIdx.y + threadIdx.y;
+
+  if (x<target.width && y<target.height)
+  {
+    const int r = patchSize / 2;
+
+    Vec<N,float> sumColor = zero<Vec<N,float>>::value();
+    float sumWeight = 0;
+
+    for (int py = -r; py <= +r; py++)
+    for (int px = -r; px <= +r; px++)
+    {
+      /*
+      if
+      (
+        x+px >= 0 && x+px < NNF.width () &&
+        y+py >= 0 && y+py < NNF.height()
+      )
+      */
+      {
+        const V2i n = NNF(x+px,y+py)-V2i(px,py);
+
+        /*if
+        (
+          n[0] >= 0 && n[0] < S.width () &&
+          n[1] >= 0 && n[1] < S.height()
+        )*/
+        {
+          const float error = E(x+px,y+py)(0)/(patchSize*patchSize*N);
+          const float weight = 1.0f/(1.0f+error);
+          sumColor += weight*Vec<N,float>(source(n(0),n(1)));
+          sumWeight += weight;
+        }
+      }
+    }
+
+    const Vec<N,T> v = Vec<N,T>(sumColor/sumWeight);
+    target.write(x,y,v);
+  }
+}
+#endif
+
+// Bilinear lookup of image I at (x,y); the four neighbouring texels are clamped to the image bounds.
+template<int N,typename T>
+Vec<N,T> sampleBilinear(const Array2<Vec<N,T>>& I,float x,float y)
+{
+  const int ix = x, iy = y;           // float -> int conversion drops the fractional part
+  const float s = x-ix, t = y-iy;     // fractional offsets used as blend weights
+  const int x0 = clamp(ix  ,0,I.width()-1),  x1 = clamp(ix+1,0,I.width()-1);
+  const int y0 = clamp(iy  ,0,I.height()-1), y1 = clamp(iy+1,0,I.height()-1);
+  const Vec<N,float> blended = ((1.0f-s)*(1.0f-t))*Vec<N,float>(I(x0,y0))+
+                               ((     s)*(1.0f-t))*Vec<N,float>(I(x1,y0))+
+                               ((1.0f-s)*(     t))*Vec<N,float>(I(x0,y1))+
+                               ((     s)*(     t))*Vec<N,float>(I(x1,y1));
+  return Vec<N,T>(blended);
+}
+
+/*
+template<int N, typename T, int M>
+__global__ void krnlEvalMask(      TexArray2<1,unsigned char> mask,
+                             const TexArray2<N,T,M> style,
+                             const TexArray2<N,T,M> style2,
+                             const int stopThreshold)
+{
+  const int x = blockDim.x*blockIdx.x + threadIdx.x;
+  const int y = blockDim.y*blockIdx.y + threadIdx.y;
+
+  if (x<mask.width && y<mask.height)
+  {
+    const Vec<N,T> s  = style(x,y);
+    const Vec<N,T> s2 = style2(x,y);
+
+    int maxDiff = 0;
+    for(int c=0;c<N;c++)
+    {
+      const int diff = std::abs(int(s[c])-int(s2[c]));
+      maxDiff = diff>maxDiff ? diff:maxDiff;
+    }
+
+    const Vec<1,unsigned char> msk = maxDiff < stopThreshold ? Vec<1,unsigned char>(0) : Vec<1,unsigned char>(255);
+
+    mask.write(x,y,msk);
+  }
+}
+
+__global__ void krnlDilateMask(TexArray2<1,unsigned char> mask2,
+                               const TexArray2<1,unsigned char> mask,
+                               const int patchSize)
+{
+  const int x = blockDim.x*blockIdx.x + threadIdx.x;
+  const int y = blockDim.y*blockIdx.y + threadIdx.y;
+
+  if (x<mask.width && y<mask.height)
+  {
+    const int r = patchSize / 2;
+
+    Vec<1,unsigned char> msk = Vec<1,unsigned char>(0);
+
+    for (int py = -r; py <= +r; py++)
+    for (int px = -r; px <= +r; px++)
+    {
+      if (mask(x+px,y+py)[0]==255) { msk = Vec<1,unsigned char>(255); }
+    }
+
+    mask2.write(x,y,msk);
+  }
+}
+*/
+
+// Resamples I into O (O keeps its size) by bilinear sampling at scaled pixel coordinates.
+// One scale factor, the ratio of the two widths, is applied to both x and y.
+template<int N,typename T>
+void resampleCPU(Array2<Vec<N,T>>& O,const Array2<Vec<N,T>>& I)
+{
+  const float widthRatio = float(I.width())/float(O.width());
+  for(int row=0;row<O.height();row++)
+  {
+    const float srcY = widthRatio*float(row);
+    for(int col=0;col<O.width();col++) { O(col,row) = sampleBilinear(I,widthRatio*float(col),srcY); }
+  }
+}
+
+// Patch dissimilarity functor: weighted sum of squared differences over the NS style channels
+// and NG guide channels of a target patch and a source patch. Only references are stored, so
+// the images and weight vectors passed to the constructor must outlive the functor.
+template<int NS,int NG,typename T>
+struct PatchSSD_Split
+{
+  const Array2<Vec<NS,T>>& targetStyle;
+  const Array2<Vec<NS,T>>& sourceStyle;
+  const Array2<Vec<NG,T>>& targetGuide;
+  const Array2<Vec<NG,T>>& sourceGuide;
+  const Vec<NS,float>& styleWeights;
+  const Vec<NG,float>& guideWeights;
+
+  PatchSSD_Split(const Array2<Vec<NS,T>>& targetStyle,const Array2<Vec<NS,T>>& sourceStyle,
+                 const Array2<Vec<NG,T>>& targetGuide,const Array2<Vec<NG,T>>& sourceGuide,
+                 const Vec<NS,float>& styleWeights,const Vec<NG,float>& guideWeights)
+  : targetStyle(targetStyle),sourceStyle(sourceStyle),
+    targetGuide(targetGuide),sourceGuide(sourceGuide),
+    styleWeights(styleWeights),guideWeights(guideWeights) {}
+
+  // Error between the patch centred at txy in the target images and the patch centred at sxy
+  // in the source images.
+  //   patchSize : patch width; r = patchSize/2 is the half-width
+  //   ebest     : cut-off used by the interior path only - accumulation stops after the first
+  //               completed row whose running error exceeds it
+  float operator()(const int patchSize,const V2i txy,const V2i sxy,const float ebest)
+  {
+    const int r  = patchSize/2;
+    const int tx = txy(0);
+    const int ty = txy(1);
+    const int sx = sxy(0);
+    const int sy = sxy(1);
+
+    const bool targetInside = tx-r>=0 && tx+r<targetStyle.width() &&
+                              ty-r>=0 && ty+r<targetStyle.height();
+    float error = 0;
+
+    if(!targetInside)
+    {
+      // Border path: the target patch sticks out of the image, so every lookup (target and
+      // source alike) is clamped to the image bounds. ebest is not consulted here.
+      for(int dy=-r;dy<=+r;dy++)
+      for(int dx=-r;dx<=+r;dx++)
+      {
+        const Vec<NS,T> ts = targetStyle(clamp(tx + dx,0,targetStyle.width()-1),clamp(ty + dy,0,targetStyle.height()-1));
+        const Vec<NS,T> ss = sourceStyle(clamp(sx + dx,0,sourceStyle.width()-1),clamp(sy + dy,0,sourceStyle.height()-1));
+        for(int c=0;c<NS;c++)
+        {
+          const float diff = float(ts[c]) - float(ss[c]);
+          error += styleWeights[c]*diff*diff;
+        }
+
+        const Vec<NG,T> tg = targetGuide(clamp(tx + dx,0,targetGuide.width()-1),clamp(ty + dy,0,targetGuide.height()-1));
+        const Vec<NG,T> sg = sourceGuide(clamp(sx + dx,0,sourceGuide.width()-1),clamp(sy + dy,0,sourceGuide.height()-1));
+        for(int c=0;c<NG;c++)
+        {
+          const float diff = float(tg[c]) - float(sg[c]);
+          error += guideWeights[c]*diff*diff;
+        }
+      }
+
+      return error;
+    }
+
+    // Interior path: the target patch is fully inside, so rows are read through raw channel
+    // pointers without clamping. The source position is not range-checked here.
+    // Relies on each Vec<K,T> being K consecutive T values and on rows being width() pixels apart.
+    const T* rowTs = (T*)&targetStyle(tx-r,ty-r);
+    const T* rowSs = (T*)&sourceStyle(sx-r,sy-r);
+    const T* rowTg = (T*)&targetGuide(tx-r,ty-r);
+    const T* rowSg = (T*)&sourceGuide(sx-r,sy-r);
+
+    const int strideTs = targetStyle.width()*NS;
+    const int strideSs = sourceStyle.width()*NS;
+    const int strideTg = targetGuide.width()*NG;
+    const int strideSg = sourceGuide.width()*NG;
+
+    for(int j=0;j<patchSize;j++)
+    {
+      for(int i=0;i<patchSize;i++)
+      {
+        for(int k=0;k<NS;k++)
+        {
+          const float diff = rowTs[i*NS+k] - rowSs[i*NS+k];
+          error += styleWeights[k]*diff*diff;
+        }
+        for(int k=0;k<NG;k++)
+        {
+          const float diff = rowTg[i*NG+k] - rowSg[i*NG+k];
+          error += guideWeights[k]*diff*diff;
+        }
+      }
+
+      // Give up once a finished row already exceeds the caller's bound.
+      if(error>ebest) { break; }
+
+      rowTs += strideTs;
+      rowSs += strideSs;
+      rowTg += strideTg;
+      rowSg += strideSg;
+    }
+
+    return error;
+  }
+};
+
+/*
+template<int NS,int NG,typename T>
+struct PatchSSD_Split_Modulation
+{
+  const TexArray2<NS,T> targetStyle;
+  const TexArray2<NS,T> sourceStyle;
+
+  const TexArray2<NG,T> targetGuide;
+  const TexArray2<NG,T> sourceGuide;
+
+  const TexArray2<NG,T> targetModulation;
+
+  const Vec<NS,float> styleWeights;
+  const Vec<NG,float> guideWeights;
+
+  PatchSSD_Split_Modulation(const TexArray2<NS,T>& targetStyle,
+                            const TexArray2<NS,T>& sourceStyle,
+
+                            const TexArray2<NG,T>& targetGuide,
+                            const TexArray2<NG,T>& sourceGuide,
+
+                            const TexArray2<NG,T>& targetModulation,
+
+                            const Vec<NS,float>&   styleWeights,
+                            const Vec<NG,float>&   guideWeights)
+
+  : targetStyle(targetStyle),sourceStyle(sourceStyle),
+    targetGuide(targetGuide),sourceGuide(sourceGuide),
+    targetModulation(targetModulation),
+    styleWeights(styleWeights),guideWeights(guideWeights) {}
+
+   __device__ float operator()(const int   patchSize,
+                               const int   tx,
+                               const int   ty,
+                               const int   sx,
+                               const int   sy,
+                               const float ebest)
+  {
+    const int r = patchSize/2;
+    float error = 0;
+
+    for(int py=-r;py<=+r;py++)
+    {
+      for(int px=-r;px<=+r;px++)
+      {
+        {
+          const Vec<NS,T> pixTs = targetStyle(tx + px,ty + py);
+          const Vec<NS,T> pixSs = sourceStyle(sx + px,sy + py);
+          for(int i=0;i<NS;i++)
+          {
+            const float diff = float(pixTs[i]) - float(pixSs[i]);
+            error += styleWeights[i]*diff*diff;
+          }
+        }
+
+        {
+          const Vec<NG,T> pixTg = targetGuide(tx + px,ty + py);
+          const Vec<NG,T> pixSg = sourceGuide(sx + px,sy + py);
+          const Vec<NG,float> mult = Vec<NG,float>(targetModulation(tx,ty))/255.0f;
+
+          for(int i=0;i<NG;i++)
+          {
+            const float diff = float(pixTg[i]) - float(pixSg[i]);
+            error += guideWeights[i]*mult[i]*diff*diff;
+          }
+        }
+      }
+
+      if (error>ebest) { return error; }
+    }
+
+    return error;
+  }
+};
+*/
+
+static V2i pyramidLevelSize(const V2i& sizeBase,const int numLevels,const int level)
+{
+  return V2i(V2f(sizeBase)*std::pow(2.0f,float(level-(numLevels-1)))); // scale 2^(level-(numLevels-1)): 1 at the last level, halved per level below
+}
+
+// Fills *out_dst from the raw buffer src, copying numel(*out_dst)*sizeof(T) bytes.
+template<typename T>
+void copy(Array2<T>* out_dst,void* src)
+{
+  memcpy(out_dst->data(),src,numel(*out_dst)*sizeof(T));
+}
+
+// Writes numel(src)*sizeof(T) bytes of src into the buffer *out_dst points to; the pointer itself is not modified.
+template<typename T>
+void copy(void** out_dst,const Array2<T>& src)
+{
+  memcpy(*out_dst,src.data(),numel(src)*sizeof(T));
+}
+
+// Adds incdec to every Omega cell of the patchWidth x patchWidth window whose top-left corner is
+// bxy - patchWidth/2. No bounds checks: the window must lie inside Omega. sizeA and axy are unused.
+void updateOmega(A2i& Omega,const V2i& sizeA,const int patchWidth,const V2i& axy,const V2i& bxy,const int incdec)
+{
+  const int half = patchWidth/2;
+  const int rowStride = Omega.width();
+
+  // Raw row pointer; relies on consecutive rows being Omega.width() ints apart.
+  int* row = (int*)&Omega(bxy(0)-half,bxy(1)-half);
+
+  for(int j=0;j<patchWidth;j++)
+  {
+    for(int i=0;i<patchWidth;i++) { row[i] += incdec; }
+
+    row += rowStride;
+  }
+}
+
+// Returns the sum of the Omega cells inside the patchWidth x patchWidth window whose top-left
+// corner is bxy - patchWidth/2. No bounds checks: the window must lie inside Omega.
+static int patchOmega(const int patchWidth,const V2i& bxy,const A2i& Omega)
+{
+  const int half = patchWidth/2;
+  const int rowStride = Omega.width();
+
+  // Raw row pointer; relies on consecutive rows being Omega.width() ints apart.
+  const int* row = (int*)&Omega(bxy(0)-half,bxy(1)-half);
+
+  int total = 0;
+
+  for(int j=0;j<patchWidth;j++)
+  {
+    for(int i=0;i<patchWidth;i++) { total += row[i]; }
+
+    row += rowStride;
+  }
+
+  return total;
+}
+
+// Tries source position bxy for target pixel axy: cost = patch error + lambda*occupancy, where occupancy is the
+// mean Omega count over the patch divided by omegaBest. A strictly lower cost updates Omega, N and E. Always returns true.
+template<typename FUNC>
+bool tryPatch(FUNC patchError,const V2i& sizeA,int patchWidth,const V2i& axy,const V2i& bxy,A2V2i& N,A2f& E,A2i& Omega,float omegaBest,float lambda)
+{
+  const float patchArea = float(patchWidth*patchWidth);
+  const float occupancyNow  = (float(patchOmega(patchWidth,N(axy),Omega))/patchArea)/omegaBest;
+  const float occupancyCand = (float(patchOmega(patchWidth,bxy,Omega))/patchArea)/omegaBest;
+  const float errorNow = E(axy);
+  // The current total cost doubles as the bound handed to patchError.
+  const float errorCand = patchError(patchWidth,axy,bxy,errorNow+lambda*occupancyNow);
+  if (!((errorCand+lambda*occupancyCand) < (errorNow+lambda*occupancyNow))) { return true; }
+
+  updateOmega(Omega,sizeA,patchWidth,axy,bxy   ,+1);
+  updateOmega(Omega,sizeA,patchWidth,axy,N(axy),-1);
+  N(axy) = bxy;
+  E(axy) = errorCand;
+  return true;
+}
+
+// PatchMatch-style search that refines the nearest-neighbour field N (one position in sizeB per
+// pixel of sizeA) and its error map E. A uniformity term is included: Omega counts, per pixel of
+// sizeB, how many patches of N cover it, and tryPatch() adds lambda times the normalised count.
+//   patchError : callable (patchWidth, targetXY, sourceXY, ebest) -> float
+//   numIters   : number of sweeps; the scan direction flips on every sweep
+//   numThreads : thread count used to size the tiles; values < 1 select omp_get_max_threads()
+// NOTE(review): tiles read and write the shared N, E and Omega without synchronisation; this
+// looks like a deliberately tolerated race - confirm before relying on reproducible output.
+template<typename FUNC>
+void patchmatch(const V2i& sizeA,const V2i& sizeB,const int patchWidth,FUNC patchError,
+                const float lambda,const int numIters,const int numThreads,A2V2i& N,A2f& E)
+{
+  const int w = patchWidth;
+  E = nnfError(N,patchWidth,patchError);
+
+  // Random-search radii: the larger extent of sizeB, then that extent scaled by sra^k until 1.
+  const float sra = 0.5f;
+  std::vector<int> irad;
+  irad.push_back((sizeB(0) > sizeB(1) ? sizeB(0) : sizeB(1)));
+  while (irad.back() != 1) irad.push_back(int(std::pow(sra, int(irad.size())) * irad[0]));
+  const int nir = int(irad.size());
+
+  // Split the rows of sizeA into horizontal tiles: one per thread when each tile would be
+  // taller than minTileHeight rows, otherwise about one tile per minTileHeight rows.
+  const int numThreads_ = numThreads<1 ? omp_get_max_threads() : numThreads;
+  const int minTileHeight = 8;
+  const int numTiles = int(ceil(float(sizeA(1))/float(numThreads_))) > minTileHeight ? numThreads_ : std::max(int(ceil(float(sizeA(1))/float(minTileHeight))),1);
+  const int tileHeight = sizeA(1)/numTiles;
+
+  // omegaBest equals the mean of Omega: total patch coverage divided by the pixel count of sizeB.
+  const float omegaBest = (float(sizeA(0)*sizeA(1)) /
+                           float(sizeB(0)*sizeB(1))) * float(patchWidth*patchWidth);
+  A2i Omega(sizeB);
+  fill(&Omega,(int)0);
+  for(int y=0;y<sizeA(1);y++)
+  for(int x=0;x<sizeA(0);x++)
+  {
+    updateOmega(Omega,sizeA,w,V2i(x,y),N(x,y),+1);
+  }
+
+  for (int iter = 0; iter < numIters; iter++)
+  {
+    const int iter_seed = rand();
+
+    // A work-sharing loop over tiles (instead of indexing tiles by omp_get_thread_num()) keeps
+    // every tile processed even when the runtime grants fewer than numTiles threads.
+    #pragma omp parallel for schedule(static,1) num_threads(numTiles)
+    for (int tile = 0; tile < numTiles; tile++)
+    {
+      const bool forward = (iter%2 == 0); // even sweeps scan each tile from its top-left corner
+      const int _y0 = tile*tileHeight;
+      const int _y1 = tile==numTiles-1 ? sizeA(1) : std::min(_y0+tileHeight,sizeA(1));
+
+      const int q  = forward ? 1 : -1;
+      const int x0 = forward ? 0 : sizeA(0)-1;
+      const int y0 = forward ? _y0 : _y1-1;
+      const int x1 = forward ? sizeA(0) : -1;
+      const int y1 = forward ? _y1 : _y0-1;
+
+      for (int y = y0; y != y1; y += q)
+      for (int x = x0; x != x1; x += q)
+      {
+        // Propagation along x: shift the match of the previously visited horizontal neighbour.
+        if (forward ? (x > 0) : (x < sizeA(0)-1))
+        {
+          V2i n = N(x-q,y); n[0] += q;
+
+          if (forward ? (n[0] < sizeB(0)-w/2) : (n[0] >= w/2))
+          {
+            tryPatch(patchError,sizeA,w,V2i(x,y),n,N,E,Omega,omegaBest,lambda);
+          }
+        }
+
+        // Propagation along y: same with the previously visited vertical neighbour.
+        if (forward ? (y > 0) : (y <sizeA(1)-1))
+        {
+          V2i n = N(x,y-q); n[1] += q;
+
+          if (forward ? (n[1] < sizeB(1)-w/2) : (n[1] >= w/2))
+          {
+            tryPatch(patchError,sizeA,w,V2i(x,y),n,N,E,Omega,omegaBest,lambda);
+          }
+        }
+
+        // Random search: per-pixel generator seeded from (x,y) and the sweep seed.
+        #define RANDI(u) (18000 * ((u) & 65535) + ((u) >> 16))
+        unsigned int seed = (x | (y<<11)) ^ iter_seed;
+        seed = RANDI(seed);
+
+        const V2i pix0 = N(x,y);
+        for (int i = nir-1; i >=0; i--) // smallest radius first
+        {
+          V2i tl = pix0 - V2i(irad[i], irad[i]);
+          V2i br = pix0 + V2i(irad[i], irad[i]);
+
+          // Keep the candidate window at least w/2 away from the borders of sizeB.
+          tl = std::max(tl,V2i(w/2,w/2));
+          br = std::min(br,sizeB-V2i(w/2,w/2));
+
+          const int _rndX = RANDI(seed);
+          const int _rndY = RANDI(_rndX);
+          seed=_rndY;
+          const V2i n = V2i
+          (
+            tl[0] + (_rndX % (br[0]-tl[0])),
+            tl[1] + (_rndY % (br[1]-tl[1]))
+          );
+
+          tryPatch(patchError,sizeA,w,V2i(x,y),n,N,E,Omega,omegaBest,lambda);
+        }
+        #undef RANDI
+      }
+    }
+  }
+}
+
+template<int NS,int NG>
+void ebsynthCpu(int    numStyleChannels,
+                int    numGuideChannels,
+                int    sourceWidth,
+                int    sourceHeight,
+                void*  sourceStyleData,
+                void*  sourceGuideData,
+                int    targetWidth,
+                int    targetHeight,
+                void*  targetGuideData,
+                void*  targetModulationData,
+                float* styleWeights,
+                float* guideWeights,
+                float  uniformityWeight,
+                int    patchSize,
+                int    voteMode,
+                int    numPyramidLevels,
+                int*   numSearchVoteItersPerLevel,
+                int*   numPatchMatchItersPerLevel,
+                int*   stopThresholdPerLevel,
+                void*  outputNnfData,
+                void*  outputImageData)
+{
+  const int levelCount = numPyramidLevels;
+
+  struct PyramidLevel
+  {
+    PyramidLevel() { }
+
+    int sourceWidth;
+    int sourceHeight;
+    int targetWidth;
+    int targetHeight;
+
+    Array2<Vec<NS,unsigned char>> sourceStyle;
+    Array2<Vec<NG,unsigned char>> sourceGuide;
+    Array2<Vec<NS,unsigned char>> targetStyle;
+    Array2<Vec<NS,unsigned char>> targetStyle2;
+    //Array2<unsigned char>         mask;
+    //Array2<unsigned char>         mask2;
+    Array2<Vec<NG,unsigned char>> targetGuide;
+    Array2<Vec<NG,unsigned char>> targetModulation;
+    Array2<Vec<2,int>>            NNF;
+    //Array2<Vec<2,int>>            NNF2;
+    Array2<float>                 E;
+    //Array2<int>                   Omega;
+  };
+
+  std::vector<PyramidLevel> pyramid(levelCount);
+  for(int level=0;level<levelCount;level++)
+  {
+    const V2i levelSourceSize = pyramidLevelSize(V2i(sourceWidth,sourceHeight),levelCount,level);
+    const V2i levelTargetSize = pyramidLevelSize(V2i(targetWidth,targetHeight),levelCount,level);
+
+    pyramid[level].sourceWidth  = levelSourceSize(0);
+    pyramid[level].sourceHeight = levelSourceSize(1);
+    pyramid[level].targetWidth  = levelTargetSize(0);
+    pyramid[level].targetHeight = levelTargetSize(1);
+  }
+
+  pyramid[levelCount-1].sourceStyle  = Array2<Vec<NS,unsigned char>>(V2i(pyramid[levelCount-1].sourceWidth,pyramid[levelCount-1].sourceHeight));
+  pyramid[levelCount-1].sourceGuide  = Array2<Vec<NG,unsigned char>>(V2i(pyramid[levelCount-1].sourceWidth,pyramid[levelCount-1].sourceHeight));
+  pyramid[levelCount-1].targetGuide  = Array2<Vec<NG,unsigned char>>(V2i(pyramid[levelCount-1].targetWidth,pyramid[levelCount-1].targetHeight));
+
+  copy(&pyramid[levelCount-1].sourceStyle,sourceStyleData);
+  copy(&pyramid[levelCount-1].sourceGuide,sourceGuideData);
+  copy(&pyramid[levelCount-1].targetGuide,targetGuideData);
+
+  if (targetModulationData)
+  {
+    pyramid[levelCount-1].targetModulation = Array2<Vec<NG,unsigned char>>(V2i(pyramid[levelCount-1].targetWidth,pyramid[levelCount-1].targetHeight));
+    copy(&pyramid[levelCount-1].targetModulation,targetModulationData); 
+  }
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+  bool inExtraPass = false;
+
+  for (int level=0;level<pyramid.size();level++)
+  {
+    const V2i levelSourceSize = V2i(pyramid[level].sourceWidth,pyramid[level].sourceHeight);
+    const V2i levelTargetSize = V2i(pyramid[level].targetWidth,pyramid[level].targetHeight);
+
+    pyramid[level].targetStyle  = Array2<Vec<NS,unsigned char>>(levelTargetSize);
+    pyramid[level].targetStyle2 = Array2<Vec<NS,unsigned char>>(levelTargetSize);
+    //pyramid[level].mask         = Array2<unsigned char>(levelTargetSize);
+    //pyramid[level].mask2        = Array2<unsigned char>(levelTargetSize);
+    pyramid[level].NNF          = Array2<Vec<2,int>>(levelTargetSize);
+    //pyramid[level].NNF2         = Array2<Vec<2,int>>(levelTargetSize);
+    //pyramid[level].Omega        = Array2<int>(levelSourceSize);
+    pyramid[level].E            = Array2<float>(levelTargetSize);
+ 
+    if (level<levelCount-1)
+    {
+      pyramid[level].sourceStyle  = Array2<Vec<NS,unsigned char>>(levelSourceSize);
+      pyramid[level].sourceGuide  = Array2<Vec<NG,unsigned char>>(levelSourceSize);
+      pyramid[level].targetGuide  = Array2<Vec<NG,unsigned char>>(levelTargetSize);
+
+      resampleCPU(pyramid[level].sourceStyle,pyramid[levelCount-1].sourceStyle);
+      resampleCPU(pyramid[level].sourceGuide,pyramid[levelCount-1].sourceGuide);
+      resampleCPU(pyramid[level].targetGuide,pyramid[levelCount-1].targetGuide);
+
+      if (targetModulationData)
+      {
+        resampleCPU(pyramid[level].targetModulation,pyramid[levelCount-1].targetModulation);
+        pyramid[level].targetModulation = Array2<Vec<NG,unsigned char>>(levelTargetSize);
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////////
+
+    if (!inExtraPass)
+    {
+      A2V2i cpu_NNF;
+      if (level>0)
+      {
+        pyramid[level].NNF = nnfUpscale(pyramid[level-1].NNF,
+                                        patchSize,
+                                        V2i(pyramid[level].targetWidth,pyramid[level].targetHeight),
+                                        V2i(pyramid[level].sourceWidth,pyramid[level].sourceHeight));
+        
+        pyramid[level-1].NNF = A2V2i();
+      }
+      else
+      {
+        pyramid[level].NNF = nnfInitRandom(V2i(pyramid[level].targetWidth,pyramid[level].targetHeight),
+                                           V2i(pyramid[level].sourceWidth,pyramid[level].sourceHeight),
+                                           patchSize);
+      }
+
+      /////////////////////////////////////////////////////////////////////////
+      /*
+      Array2<int> cpu_Omega(pyramid[level].sourceWidth,pyramid[level].sourceHeight);
+
+      fill(&cpu_Omega,(int)0);
+      for(int ay=0;ay<cpu_NNF.height();ay++)
+      for(int ax=0;ax<cpu_NNF.width();ax++)
+      {
+        const V2i& n = cpu_NNF(ax,ay);
+        const int bx = n(0);
+        const int by = n(1);
+
+        const int r = patchSize/2;
+
+        for(int oy=-r;oy<=+r;oy++)
+        for(int ox=-r;ox<=+r;ox++)
+        {
+          const int x = bx+ox;
+          const int y = by+oy;
+          cpu_Omega(x,y) += 1;
+        }
+      }
+
+      copy(&pyramid[level].Omega,cpu_Omega);
+      */
+      /////////////////////////////////////////////////////////////////////////
+    }
+
+    ////////////////////////////////////////////////////////////////////////////
+    {
+      krnlVotePlain(pyramid[level].targetStyle2,
+                    pyramid[level].sourceStyle,
+                    pyramid[level].NNF,
+                    patchSize);
+
+      std::swap(pyramid[level].targetStyle2,pyramid[level].targetStyle);
+    }
+    ////////////////////////////////////////////////////////////////////////////
+
+    //Array2<Vec<1,unsigned char>> cpu_mask(V2i(pyramid[level].targetWidth,pyramid[level].targetHeight));
+    //fill(&cpu_mask,Vec<1,unsigned char>(255));
+    //copy(&pyramid[level].mask,cpu_mask);
+
+    ////////////////////////////////////////////////////////////////////////////
+
+    for (int voteIter=0;voteIter<numSearchVoteItersPerLevel[level];voteIter++)
+    {
+      Vec<NS,float> styleWeightsVec;
+      for(int i=0;i<NS;i++) { styleWeightsVec[i] = styleWeights[i]; }
+
+      Vec<NG,float> guideWeightsVec;
+      for(int i=0;i<NG;i++) { guideWeightsVec[i] = guideWeights[i]; }
+
+      //if (numPatchMatchItersPerLevel[level]>0)
+      {
+        /*if (targetModulationData)
+        {
+          patchmatchGPU(V2i(pyramid[level].targetWidth,pyramid[level].targetHeight),
+                        V2i(pyramid[level].sourceWidth,pyramid[level].sourceHeight),
+                        pyramid[level].Omega,
+                        patchSize,
+                        PatchSSD_Split_Modulation<NS,NG,unsigned char>(pyramid[level].targetStyle,
+                                                                       pyramid[level].sourceStyle,
+                                                                       pyramid[level].targetGuide,
+                                                                       pyramid[level].sourceGuide,
+                                                                       pyramid[level].targetModulation,
+                                                                       styleWeightsVec,
+                                                                       guideWeightsVec),
+                        uniformityWeight,
+                        numPatchMatchItersPerLevel[level],
+                        numGpuThreadsPerBlock,
+                        pyramid[level].NNF,
+                        pyramid[level].NNF2,
+                        pyramid[level].E,
+                        pyramid[level].mask,
+                        rngStates);
+        }
+        else*/
+        {
+          patchmatch(V2i(pyramid[level].targetWidth,pyramid[level].targetHeight),
+                     V2i(pyramid[level].sourceWidth,pyramid[level].sourceHeight),
+                     patchSize,
+                     PatchSSD_Split<NS,NG,unsigned char>(pyramid[level].targetStyle,
+                                                         pyramid[level].sourceStyle,
+                                                         pyramid[level].targetGuide,
+                                                         pyramid[level].sourceGuide,
+                                                         styleWeightsVec,
+                                                         guideWeightsVec),
+                     uniformityWeight,                             
+                     numPatchMatchItersPerLevel[level],
+                     -1,
+                     pyramid[level].NNF,
+                     pyramid[level].E);                         
+        }
+      }
+      /*
+      else
+      {       
+        if (targetModulationData)
+        {
+          krnlEvalErrorPass<<<numBlocks,threadsPerBlock>>>(patchSize,
+                                                           PatchSSD_Split_Modulation<NS,NG,unsigned char>(pyramid[level].targetStyle,
+                                                                                                          pyramid[level].sourceStyle,
+                                                                                                          pyramid[level].targetGuide,
+                                                                                                          pyramid[level].sourceGuide,
+                                                                                                          pyramid[level].targetModulation,
+                                                                                                          styleWeightsVec,
+                                                                                                          guideWeightsVec),
+                                                           pyramid[level].NNF,
+                                                           pyramid[level].E);
+        }
+        else
+        {
+          krnlEvalErrorPass<<<numBlocks,threadsPerBlock>>>(patchSize,
+                                                           PatchSSD_Split<NS,NG,unsigned char>(pyramid[level].targetStyle,
+                                                                                               pyramid[level].sourceStyle,
+                                                                                               pyramid[level].targetGuide,
+                                                                                               pyramid[level].sourceGuide,
+                                                                                               styleWeightsVec,
+                                                                                               guideWeightsVec),
+                                                           pyramid[level].NNF,
+                                                           pyramid[level].E);
+        }
+        checkCudaError( cudaDeviceSynchronize() );        
+      }
+      */
+      {
+        //if      (voteMode==EBSYNTH_VOTEMODE_PLAIN)
+        {
+          krnlVotePlain(pyramid[level].targetStyle2,
+                        pyramid[level].sourceStyle,
+                        pyramid[level].NNF,
+                        patchSize);
+        }
+        /*else if (voteMode==EBSYNTH_VOTEMODE_WEIGHTED)
+        {
+          krnlVoteWeighted<<<numBlocks,threadsPerBlock>>>(pyramid[level].targetStyle2,
+                                                          pyramid[level].sourceStyle,
+                                                          pyramid[level].NNF,
+                                                          pyramid[level].E,
+                                                          patchSize);
+        }*/
+
+        std::swap(pyramid[level].targetStyle2,pyramid[level].targetStyle);
+
+        /*
+        if (voteIter<numSearchVoteItersPerLevel[level]-1)
+        {
+          krnlEvalMask<<<numBlocks,threadsPerBlock>>>(pyramid[level].mask,
+                                                      pyramid[level].targetStyle,
+                                                      pyramid[level].targetStyle2,
+                                                      stopThresholdPerLevel[level]);
+          checkCudaError( cudaDeviceSynchronize() );
+
+          krnlDilateMask<<<numBlocks,threadsPerBlock>>>(pyramid[level].mask2,
+                                                        pyramid[level].mask,
+                                                        patchSize);
+          std::swap(pyramid[level].mask2,pyramid[level].mask);
+          checkCudaError( cudaDeviceSynchronize() );
+        }
+        */
+      }
+    }
+
+    if (level==levelCount-1)
+    {
+      if (outputNnfData!=NULL) { copy(&outputNnfData,pyramid[level].NNF); }
+      copy(&outputImageData,pyramid[level].targetStyle);
+    }
+
+    pyramid[level].sourceStyle = Array2<Vec<NS,unsigned char>>();
+    pyramid[level].sourceGuide = Array2<Vec<NG,unsigned char>>();
+    pyramid[level].targetGuide = Array2<Vec<NG,unsigned char>>();
+    pyramid[level].targetStyle = Array2<Vec<NS,unsigned char>>();
+    pyramid[level].targetStyle2 = Array2<Vec<NS,unsigned char>>();
+    //pyramid[level].mask = Array2<unsigned char>();
+    //pyramid[level].mask2 = Array2<unsigned char>();
+    //pyramid[level].NNF2 = Array2<Vec<2,int>>();
+    //pyramid[level].Omega = Array2<int>();
+    pyramid[level].E = Array2<float>();
+    if (targetModulationData) { pyramid[level].targetModulation = Array2<Vec<NG,unsigned char>>(); }
+  }
+
+  pyramid[levelCount-1].NNF = Array2<Vec<2,int>>();
+}
+
+void ebsynthRunCpu(int    numStyleChannels, // Forwards all arguments to the ebsynthCpu<NS,NG> instantiation matching the channel counts.
+                   int    numGuideChannels, // Both counts must lie in 1..EBSYNTH_MAX_*_CHANNELS; otherwise the call returns without doing anything.
+                   int    sourceWidth,
+                   int    sourceHeight,
+                   void*  sourceStyleData,
+                   void*  sourceGuideData,
+                   int    targetWidth,
+                   int    targetHeight,
+                   void*  targetGuideData,
+                   void*  targetModulationData,
+                   float* styleWeights,
+                   float* guideWeights,
+                   float  uniformityWeight,
+                   int    patchSize,
+                   int    voteMode,
+                   int    numPyramidLevels,
+                   int*   numSearchVoteItersPerLevel,
+                   int*   numPatchMatchItersPerLevel,
+                   int*   stopThresholdPerLevel,
+                   void*  outputNnfData,
+                   void*  outputImageData)
+{
+  typedef void (*EbsynthCpuFn)(int,int,int,int,void*,void*,int,int,void*,void*,float*,float*,float,int,int,int,int*,int*,int*,void*,void*);
+  const EbsynthCpuFn dispatchEbsynth[EBSYNTH_MAX_GUIDE_CHANNELS][EBSYNTH_MAX_STYLE_CHANNELS] = // [NG-1][NS-1] -> ebsynthCpu<NS,NG>
+  {
+    { ebsynthCpu<1, 1>, ebsynthCpu<2, 1>, ebsynthCpu<3, 1>, ebsynthCpu<4, 1>, ebsynthCpu<5, 1>, ebsynthCpu<6, 1>, ebsynthCpu<7, 1>, ebsynthCpu<8, 1> },
+    { ebsynthCpu<1, 2>, ebsynthCpu<2, 2>, ebsynthCpu<3, 2>, ebsynthCpu<4, 2>, ebsynthCpu<5, 2>, ebsynthCpu<6, 2>, ebsynthCpu<7, 2>, ebsynthCpu<8, 2> },
+    { ebsynthCpu<1, 3>, ebsynthCpu<2, 3>, ebsynthCpu<3, 3>, ebsynthCpu<4, 3>, ebsynthCpu<5, 3>, ebsynthCpu<6, 3>, ebsynthCpu<7, 3>, ebsynthCpu<8, 3> },
+    { ebsynthCpu<1, 4>, ebsynthCpu<2, 4>, ebsynthCpu<3, 4>, ebsynthCpu<4, 4>, ebsynthCpu<5, 4>, ebsynthCpu<6, 4>, ebsynthCpu<7, 4>, ebsynthCpu<8, 4> },
+    { ebsynthCpu<1, 5>, ebsynthCpu<2, 5>, ebsynthCpu<3, 5>, ebsynthCpu<4, 5>, ebsynthCpu<5, 5>, ebsynthCpu<6, 5>, ebsynthCpu<7, 5>, ebsynthCpu<8, 5> },
+    { ebsynthCpu<1, 6>, ebsynthCpu<2, 6>, ebsynthCpu<3, 6>, ebsynthCpu<4, 6>, ebsynthCpu<5, 6>, ebsynthCpu<6, 6>, ebsynthCpu<7, 6>, ebsynthCpu<8, 6> },
+    { ebsynthCpu<1, 7>, ebsynthCpu<2, 7>, ebsynthCpu<3, 7>, ebsynthCpu<4, 7>, ebsynthCpu<5, 7>, ebsynthCpu<6, 7>, ebsynthCpu<7, 7>, ebsynthCpu<8, 7> },
+    { ebsynthCpu<1, 8>, ebsynthCpu<2, 8>, ebsynthCpu<3, 8>, ebsynthCpu<4, 8>, ebsynthCpu<5, 8>, ebsynthCpu<6, 8>, ebsynthCpu<7, 8>, ebsynthCpu<8, 8> },
+    { ebsynthCpu<1, 9>, ebsynthCpu<2, 9>, ebsynthCpu<3, 9>, ebsynthCpu<4, 9>, ebsynthCpu<5, 9>, ebsynthCpu<6, 9>, ebsynthCpu<7, 9>, ebsynthCpu<8, 9> },
+    { ebsynthCpu<1,10>, ebsynthCpu<2,10>, ebsynthCpu<3,10>, ebsynthCpu<4,10>, ebsynthCpu<5,10>, ebsynthCpu<6,10>, ebsynthCpu<7,10>, ebsynthCpu<8,10> },
+    { ebsynthCpu<1,11>, ebsynthCpu<2,11>, ebsynthCpu<3,11>, ebsynthCpu<4,11>, ebsynthCpu<5,11>, ebsynthCpu<6,11>, ebsynthCpu<7,11>, ebsynthCpu<8,11> },
+    { ebsynthCpu<1,12>, ebsynthCpu<2,12>, ebsynthCpu<3,12>, ebsynthCpu<4,12>, ebsynthCpu<5,12>, ebsynthCpu<6,12>, ebsynthCpu<7,12>, ebsynthCpu<8,12> },
+    { ebsynthCpu<1,13>, ebsynthCpu<2,13>, ebsynthCpu<3,13>, ebsynthCpu<4,13>, ebsynthCpu<5,13>, ebsynthCpu<6,13>, ebsynthCpu<7,13>, ebsynthCpu<8,13> },
+    { ebsynthCpu<1,14>, ebsynthCpu<2,14>, ebsynthCpu<3,14>, ebsynthCpu<4,14>, ebsynthCpu<5,14>, ebsynthCpu<6,14>, ebsynthCpu<7,14>, ebsynthCpu<8,14> },
+    { ebsynthCpu<1,15>, ebsynthCpu<2,15>, ebsynthCpu<3,15>, ebsynthCpu<4,15>, ebsynthCpu<5,15>, ebsynthCpu<6,15>, ebsynthCpu<7,15>, ebsynthCpu<8,15> },
+    { ebsynthCpu<1,16>, ebsynthCpu<2,16>, ebsynthCpu<3,16>, ebsynthCpu<4,16>, ebsynthCpu<5,16>, ebsynthCpu<6,16>, ebsynthCpu<7,16>, ebsynthCpu<8,16> },
+    { ebsynthCpu<1,17>, ebsynthCpu<2,17>, ebsynthCpu<3,17>, ebsynthCpu<4,17>, ebsynthCpu<5,17>, ebsynthCpu<6,17>, ebsynthCpu<7,17>, ebsynthCpu<8,17> },
+    { ebsynthCpu<1,18>, ebsynthCpu<2,18>, ebsynthCpu<3,18>, ebsynthCpu<4,18>, ebsynthCpu<5,18>, ebsynthCpu<6,18>, ebsynthCpu<7,18>, ebsynthCpu<8,18> },
+    { ebsynthCpu<1,19>, ebsynthCpu<2,19>, ebsynthCpu<3,19>, ebsynthCpu<4,19>, ebsynthCpu<5,19>, ebsynthCpu<6,19>, ebsynthCpu<7,19>, ebsynthCpu<8,19> },
+    { ebsynthCpu<1,20>, ebsynthCpu<2,20>, ebsynthCpu<3,20>, ebsynthCpu<4,20>, ebsynthCpu<5,20>, ebsynthCpu<6,20>, ebsynthCpu<7,20>, ebsynthCpu<8,20> },
+    { ebsynthCpu<1,21>, ebsynthCpu<2,21>, ebsynthCpu<3,21>, ebsynthCpu<4,21>, ebsynthCpu<5,21>, ebsynthCpu<6,21>, ebsynthCpu<7,21>, ebsynthCpu<8,21> },
+    { ebsynthCpu<1,22>, ebsynthCpu<2,22>, ebsynthCpu<3,22>, ebsynthCpu<4,22>, ebsynthCpu<5,22>, ebsynthCpu<6,22>, ebsynthCpu<7,22>, ebsynthCpu<8,22> },
+    { ebsynthCpu<1,23>, ebsynthCpu<2,23>, ebsynthCpu<3,23>, ebsynthCpu<4,23>, ebsynthCpu<5,23>, ebsynthCpu<6,23>, ebsynthCpu<7,23>, ebsynthCpu<8,23> },
+    { ebsynthCpu<1,24>, ebsynthCpu<2,24>, ebsynthCpu<3,24>, ebsynthCpu<4,24>, ebsynthCpu<5,24>, ebsynthCpu<6,24>, ebsynthCpu<7,24>, ebsynthCpu<8,24> }
+  };
+
+  if (numStyleChannels<1 || numStyleChannels>EBSYNTH_MAX_STYLE_CHANNELS ||
+      numGuideChannels<1 || numGuideChannels>EBSYNTH_MAX_GUIDE_CHANNELS) { return; } // unsupported channel count: outputs are left untouched
+  const EbsynthCpuFn synthesize = dispatchEbsynth[numGuideChannels-1][numStyleChannels-1];
+  synthesize(numStyleChannels,
+             numGuideChannels,
+             sourceWidth,
+             sourceHeight,
+             sourceStyleData,
+             sourceGuideData,
+             targetWidth,
+             targetHeight,
+             targetGuideData,
+             targetModulationData,
+             styleWeights,
+             guideWeights,
+             uniformityWeight,
+             patchSize,
+             voteMode,
+             numPyramidLevels,
+             numSearchVoteItersPerLevel,
+             numPatchMatchItersPerLevel,
+             stopThresholdPerLevel,
+             outputNnfData,
+             outputImageData);
+}
+
+int ebsynthBackendAvailableCpu() // Availability query for the CPU backend.
+{
+  return 1; // always 1: no runtime probing is performed for the CPU path
+}
diff --git a/src/ebsynth_cpu.h b/src/ebsynth_cpu.h
new file mode 100644
index 0000000..03dbfcf
--- /dev/null
+++ b/src/ebsynth_cpu.h
@@ -0,0 +1,32 @@
+// This software is in the public domain. Where that dedication is not
+// recognized, you are granted a perpetual, irrevocable license to copy
+// and modify this file as you see fit.
+
+#ifndef EBSYNTH_CPU_H_
+#define EBSYNTH_CPU_H_
+
+void ebsynthRunCpu(int    numStyleChannels, // must be in 1..EBSYNTH_MAX_STYLE_CHANNELS, otherwise the call does nothing
+                   int    numGuideChannels, // must be in 1..EBSYNTH_MAX_GUIDE_CHANNELS, otherwise the call does nothing
+                   int    sourceWidth,
+                   int    sourceHeight,
+                   void*  sourceStyleData,
+                   void*  sourceGuideData,
+                   int    targetWidth,
+                   int    targetHeight,
+                   void*  targetGuideData,
+                   void*  targetModulationData,
+                   float* styleWeights,
+                   float* guideWeights,
+                   float  uniformityWeight,
+                   int    patchSize,
+                   int    voteMode,
+                   int    numPyramidLevels,
+                   int*   numSearchVoteItersPerLevel, // presumably one entry per pyramid level - confirm against the caller
+                   int*   numPatchMatchItersPerLevel, // presumably one entry per pyramid level - confirm against the caller
+                   int*   stopThresholdPerLevel,      // presumably one entry per pyramid level - confirm against the caller
+                   void*  outputNnfData,
+                   void*  outputImageData);
+
+int ebsynthBackendAvailableCpu(); // the CPU implementation returns 1 unconditionally
+
+#endif
diff --git a/src/ebsynth.cu b/src/ebsynth_cuda.cu
similarity index 56%
rename from src/ebsynth.cu
rename to src/ebsynth_cuda.cu
index 582d6bb..b0ef1bb 100644
--- a/src/ebsynth.cu
+++ b/src/ebsynth_cuda.cu
@@ -3,11 +3,369 @@
 // and modify this file as you see fit.
 
 #include "ebsynth.h"
-#include "patchmatch_gpu.h"
+#include "ebsynth_cuda_texarray2.h"
+#include "ebsynth_cuda_memarray2.h"
+
+#include <cmath>
+#include <cfloat>
+#include <stdint.h>
 
 #define FOR(A,X,Y) for(int Y=0;Y<A.height();Y++) for(int X=0;X<A.width();X++)
 
-A2V2i nnfInitRandom(const V2i& targetSize,
+typedef Vec<1,float> V1f; // one-component float vector; the kernels below wrap error values in it before E.write(...)
+typedef Array2<Vec<1,float>> A2V1f; // Array2 whose elements are one-component float vectors
+
+struct pcgState // State of one random generator used by the pcg* helpers; krnlInitRngStates sets up one per pixel.
+{
+  uint64_t state;     // current 64-bit generator state, stepped by pcgAdvance
+  uint64_t increment; // additive constant of the state update; pcgInit derives it from the stream id and forces it odd
+};
+
+__device__ void pcgAdvance(pcgState* rng) // Steps the generator once: state = state*multiplier + increment.
+{
+  rng->state = rng->state * 6364136223846793005ULL + rng->increment; // unsigned 64-bit arithmetic, so the result wraps modulo 2^64
+}
+
+__device__ uint32_t pcgOutput(uint64_t state) // Maps a 64-bit state to a 32-bit output value; takes the state by value and modifies nothing.
+{
+  return (uint32_t)(((state >> 22u) ^ state) >> ((state >> 61u) + 22u)); // xor with itself shifted by 22, then shift right by 22..29 as selected by the top 3 state bits
+}
+
+__device__ uint32_t pcgRand(pcgState* rng) // Returns the next 32-bit random number and advances *rng by one step.
+{
+  const uint32_t value = pcgOutput(rng->state); // the output is derived from the state as it was before this call's advance
+  pcgAdvance(rng);
+  return value;
+}
+
+__device__ void pcgInit(pcgState* rng,uint64_t seed,uint64_t stream) // Seeds *rng from a (seed, stream) pair.
+{
+  rng->state = 0U;
+  rng->increment = (stream << 1u) | 1u; // low bit forced to 1, so the increment is always odd
+  pcgAdvance(rng);                      // state was 0, so after this step it equals the increment
+  rng->state += seed;
+  pcgAdvance(rng);                      // second step mixes the seed into the state
+}
+
+__global__ void krnlInitRngStates(const int width,      // Kernel: seeds one pcgState per pixel of a width x height grid.
+                                  const int height,
+                                  pcgState* rngStates)  // width*height entries, indexed x + y*width
+{
+  const int px = blockDim.x*blockIdx.x + threadIdx.x;
+  const int py = blockDim.y*blockIdx.y + threadIdx.y;
+
+  if (px>=width || py>=height) { return; } // surplus threads outside the grid have nothing to seed
+
+  // Every generator gets the same seed (1337); the linear pixel index is used as its stream id.
+  const int pixel = px+py*width;
+  pcgInit(&rngStates[pixel],1337,pixel);
+}
+
+pcgState* initGpuRng(const int width,   // Allocates width*height pcgState entries with cudaMalloc and launches their seeding kernel.
+                     const int height)  // Returns the device pointer; the caller is responsible for releasing it with cudaFree.
+{
+  pcgState* gpuRngStates = NULL; // defined value even if the allocation below fails
+  checkCudaError( cudaMalloc(&gpuRngStates,size_t(width)*size_t(height)*sizeof(pcgState)) ); // size_t math: width*height in int can overflow for large images
+
+  const dim3 threadsPerBlock(16,16);
+  const dim3 numBlocks((width+threadsPerBlock.x)/threadsPerBlock.x,   // enough 16x16 blocks to cover the grid; surplus threads exit in the kernel's bounds check
+                       (height+threadsPerBlock.y)/threadsPerBlock.y);
+
+  krnlInitRngStates<<<numBlocks,threadsPerBlock>>>(width,height,gpuRngStates); // no synchronization is done here after the launch
+
+  return gpuRngStates;
+}
+
+template<typename FUNC>
+__global__ void krnlEvalErrorPass(const int patchWidth,          // Kernel: recomputes E(x,y) for the match currently stored in NNF(x,y).
+                                  FUNC patchError,
+                                  const TexArray2<2,int> NNF,
+                                  TexArray2<1,float> E)
+{
+  const int px = blockDim.x*blockIdx.x + threadIdx.x;
+  const int py = blockDim.y*blockIdx.y + threadIdx.y;
+
+  if (px>=NNF.width || py>=NNF.height) { return; } // surplus threads outside the NNF do nothing
+
+  // FLT_MAX goes in as patchError's last argument (presumably an early-out bound, i.e. "no bound" here - confirm in patchError).
+  const V2i match = NNF(px,py);
+  E.write(px,py,V1f(patchError(patchWidth,px,py,match[0],match[1],FLT_MAX)));
+}
+
+void __device__ updateOmega(MemArray2<int>& Omega,const int patchWidth,const int bx,const int by,const int incdec)
+{
+  // Adds incdec to every Omega cell covered by the patch centred at (bx,by); no bounds check is done here.
+  const int half = patchWidth/2;
+
+  for(int y=by-half;y<=by+half;y++)
+  {
+    for(int x=bx-half;x<=bx+half;x++)
+    {
+      atomicAdd(&Omega.data[x+y*Omega.width],incdec); // atomic, since several threads can update the same cell
+    }
+  }
+}
+
+int __device__ patchOmega(const int patchWidth,const int bx,const int by,const MemArray2<int>& Omega)
+{
+  // Sums the Omega cells covered by the patch centred at (bx,by); no bounds check is done here.
+  const int half = patchWidth/2;
+
+  int total = 0;
+  for(int y=by-half;y<=by+half;y++)
+  {
+    for(int x=bx-half;x<=bx+half;x++)
+    {
+      total += Omega.data[x+y*Omega.width]; /// XXX: plain read; atomic read instead ??
+    }
+  }
+
+  return total;
+}
+
+template<typename FUNC> // Tests source patch (bx,by) against the current match nbest of target pixel (ax,ay) and adopts it if cheaper.
+__device__ void tryPatch(const  V2i& sizeA,
+                         const  V2i& sizeB,
+                                MemArray2<int>& Omega,   // per-source-pixel usage counters, updated when the match changes
+                         const  int patchWidth,
+                         FUNC   patchError,
+                         const  float lambda,            // weight of the occupancy term in the total cost
+                         const  int ax,
+                         const  int ay,
+                         const  int bx,
+                         const  int by,
+                         V2i&   nbest,                   // in/out: current best match, overwritten on acceptance
+                         float& ebest)                   // in/out: patch error of nbest, overwritten on acceptance
+{
+  const float omegaBest = (float(sizeA(0)*sizeA(1)) /
+                           float(sizeB(0)*sizeB(1))) * float(patchWidth*patchWidth); // (area of A / area of B) * patch area; normaliser for the occupancy terms
+
+  const float curOcc = (float(patchOmega(patchWidth,nbest(0),nbest(1),Omega))/float(patchWidth*patchWidth))/omegaBest; // mean Omega under the current match, relative to omegaBest
+  const float newOcc = (float(patchOmega(patchWidth,      bx,      by,Omega))/float(patchWidth*patchWidth))/omegaBest; // same for the candidate
+
+  const float curErr = ebest;
+  const float newErr = patchError(patchWidth,ax,ay,bx,by,curErr+lambda*curOcc); // last argument is the current total cost - presumably an early-out bound; confirm in patchError
+
+  if ((newErr+lambda*newOcc) < (curErr+lambda*curOcc)) // accept only a strictly lower error + lambda*occupancy
+  {
+    updateOmega(Omega,patchWidth,      bx,      by,+1); // count the new match ...
+    updateOmega(Omega,patchWidth,nbest(0),nbest(1),-1); // ... and release the old one
+    nbest = V2i(bx,by);
+    ebest = newErr;
+  }
+}
+
+template<typename FUNC>
+__device__ void tryNeighborsOffset(const int x,          // Propagation step: tests the match of neighbour (x+ox,y+oy), shifted back by (ox,oy).
+                                   const int y,
+                                   const int ox,
+                                   const int oy,
+                                   V2i& nbest,
+                                   float& ebest,
+                                   const V2i& sizeA,
+                                   const V2i& sizeB,
+                                         MemArray2<int>& Omega,
+                                   const int patchWidth,
+                                   FUNC patchError,
+                                   const float lambda,
+                                   const TexArray2<2,int>& NNF)
+{
+  const int border = patchWidth/2;
+
+  // Candidate = what the neighbour points at, moved by the opposite of the offset.
+  const V2i neighborMatch = NNF(x+ox,y+oy);
+  const int candX = neighborMatch(0)-ox;
+  const int candY = neighborMatch(1)-oy;
+
+  const bool insideX = (candX>=border) && (candX<sizeB(0)-border);
+  const bool insideY = (candY>=border) && (candY<sizeB(1)-border);
+  if (!(insideX && insideY)) { return; } // a patch centred there would not fit inside B
+  tryPatch(sizeA,sizeB,Omega,patchWidth,patchError,lambda,x,y,candX,candY,nbest,ebest);
+}
+
+template<typename FUNC> // Kernel: one propagation pass at neighbour distance r; reads NNF, writes the result to NNF2.
+__global__ void krnlPropagationPass(const V2i sizeA,
+                                    const V2i sizeB,
+                                          MemArray2<int> Omega,
+                                    const int patchWidth,
+                                    FUNC  patchError,
+                                    const float lambda,
+                                    const int r,                        // neighbour distance in pixels
+                                    const TexArray2<2,int> NNF,         // input field, only read here
+                                    TexArray2<2,int> NNF2,              // output field
+                                    TexArray2<1,float> E,               // read and rewritten in place
+                                    TexArray2<1,unsigned char> mask)    // only pixels with value 255 are updated
+{
+  const int x = blockDim.x*blockIdx.x + threadIdx.x;
+  const int y = blockDim.y*blockIdx.y + threadIdx.y;
+
+  if (x<sizeA(0) && y<sizeA(1))
+  {
+    V2i   nbest = NNF(x,y);
+    float ebest = E(x,y)(0);
+
+    if (mask(x,y)[0]==255)
+    {
+      tryNeighborsOffset(x,y,-r,0,nbest,ebest,sizeA,sizeB,Omega,patchWidth,patchError,lambda,NNF); // left neighbour
+      tryNeighborsOffset(x,y,+r,0,nbest,ebest,sizeA,sizeB,Omega,patchWidth,patchError,lambda,NNF); // right neighbour
+      tryNeighborsOffset(x,y,0,-r,nbest,ebest,sizeA,sizeB,Omega,patchWidth,patchError,lambda,NNF); // neighbour at y-r
+      tryNeighborsOffset(x,y,0,+r,nbest,ebest,sizeA,sizeB,Omega,patchWidth,patchError,lambda,NNF); // neighbour at y+r
+    }
+
+    E.write(x,y,V1f(ebest));   // written for masked-out pixels too (unchanged values), so NNF2 is complete
+    NNF2.write(x,y,nbest);
+  }
+}
+
+template<typename FUNC> // Draws one random source position within distance r of norg (per axis) and tests it with tryPatch.
+__device__ void tryRandomOffsetInRadius(const int r,
+                                        const V2i& sizeA,
+                                        const V2i& sizeB,
+                                              MemArray2<int>& Omega,
+                                        const int patchWidth,
+                                        FUNC  patchError,
+                                        const float lambda,
+                                        const int x,
+                                        const int y,
+                                        const V2i& norg,      // centre of the search window
+                                        V2i&  nbest,          // in/out: current best match
+                                        float& ebest,         // in/out: error of nbest
+                                        pcgState* rngState)   // advanced by two draws
+{
+  const int hpw = patchWidth/2;
+
+  const int xmin = max(norg(0)-r,hpw);              // window clamped so a patch centred inside it stays within B
+  const int xmax = min(norg(0)+r,sizeB(0)-1-hpw);
+  const int ymin = max(norg(1)-r,hpw);
+  const int ymax = min(norg(1)+r,sizeB(1)-1-hpw);
+
+  const int nx = xmin+(pcgRand(rngState)%(xmax-xmin+1)); // first draw picks x; NOTE(review): needs xmax>=xmin, which holds when norg is itself a valid patch centre
+  const int ny = ymin+(pcgRand(rngState)%(ymax-ymin+1)); // second draw picks y
+
+  tryPatch(sizeA,sizeB,Omega,patchWidth,patchError,lambda,x,y,nx,ny,nbest,ebest);
+}
+
+/*
+template<typename FUNC>
+__global__ void krnlRandomSearchPass(const V2i sizeA,
+                                     const V2i sizeB,
+                                     MemArray2<int> Omega,
+                                     const int patchWidth,
+                                     FUNC  patchError,
+                                     const float lambda,
+                                     TexArray2<2,int> NNF,
+                                     TexArray2<1,float> E,
+                                     TexArray2<1,unsigned char> mask,
+                                     pcgState* rngStates)
+{
+  const int x = blockDim.x*blockIdx.x + threadIdx.x;
+  const int y = blockDim.y*blockIdx.y + threadIdx.y;
+
+  if (x<sizeA(0) && y<sizeA(1))
+  {
+    if (mask(x,y)[0]==255)
+    {
+      V2i nbest = NNF(x,y);
+      float ebest = E(x,y)(0);
+
+      const V2i norg = nbest;
+
+      for(int r=1;r<max(sizeB(0),sizeB(1))/2;r=r*2)
+      {
+        tryRandomOffsetInRadius(r,sizeA,sizeB,Omega,patchWidth,patchError,lambda,x,y,norg,nbest,ebest,&rngStates[x+y*NNF.width]);
+      }
+
+      E.write(x,y,V1f(ebest));
+      NNF.write(x,y,nbest);
+    }
+  }
+}
+*/
+
+template<typename FUNC>
+__global__ void krnlRandomSearchPass(const V2i sizeA,          // Kernel: one random-search trial per masked pixel at the given radius.
+                                     const V2i sizeB,
+                                     MemArray2<int> Omega,
+                                     const int patchWidth,
+                                     FUNC  patchError,
+                                     const float lambda,
+                                     const int radius,
+                                     TexArray2<2,int> NNF,
+                                     TexArray2<1,float> E,
+                                     TexArray2<1,unsigned char> mask,
+                                     pcgState* rngStates)
+{
+  const int px = blockDim.x*blockIdx.x + threadIdx.x;
+  const int py = blockDim.y*blockIdx.y + threadIdx.y;
+
+  // Nothing to do for surplus threads or for pixels the mask excludes.
+  if (px>=sizeA(0) || py>=sizeA(1)) { return; }
+  if (mask(px,py)[0]!=255) { return; }
+
+  const V2i start   = NNF(px,py);                       // the search window is centred on the match held at kernel entry
+  V2i       best    = start;
+  float     bestErr = E(px,py)(0);
+  pcgState* rng     = &rngStates[px+py*NNF.width];      // generator belonging to this pixel
+
+  // NNF and E are updated in place, unlike the propagation pass which writes to a second field.
+  tryRandomOffsetInRadius(radius,sizeA,sizeB,Omega,patchWidth,patchError,lambda,px,py,start,best,bestErr,rng);
+
+  // Written back unconditionally; the values are unchanged when the candidate was rejected.
+  E.write(px,py,V1f(bestErr));
+  NNF.write(px,py,best);
+}
+
+template<typename FUNC> // Host driver: runs numIters rounds of propagation (distances 4,2,1) plus random search, then re-evaluates E.
+void patchmatchGPU(const V2i sizeA,
+                   const V2i sizeB,
+                   MemArray2<int>& Omega,
+                   const int patchWidth,
+                   FUNC patchError,
+                   const float lambda,
+                   const int numIters,
+                   const int numThreadsPerBlock,          // blocks are numThreadsPerBlock x numThreadsPerBlock threads
+                   TexArray2<2,int>& NNF,                 // in/out; swapped with NNF2 three times per iteration
+                   TexArray2<2,int>& NNF2,                // scratch field of the propagation passes
+                   TexArray2<1,float>& E,
+                   TexArray2<1,unsigned char>& mask,
+                   pcgState* rngStates)
+{
+  const dim3 threadsPerBlock = dim3(numThreadsPerBlock,numThreadsPerBlock);
+  const dim3 numBlocks = dim3((NNF.width+threadsPerBlock.x)/threadsPerBlock.x,   // enough blocks to cover the NNF; kernels bounds-check surplus threads
+                              (NNF.height+threadsPerBlock.y)/threadsPerBlock.y);
+
+  krnlEvalErrorPass<<<numBlocks,threadsPerBlock>>>(patchWidth,patchError,NNF,E); // E for the incoming NNF
+
+  checkCudaError(cudaDeviceSynchronize());
+
+  for(int i=0;i<numIters;i++)
+  {
+    krnlPropagationPass<<<numBlocks,threadsPerBlock>>>(sizeA,sizeB,Omega,patchWidth,patchError,lambda,4,NNF,NNF2,E,mask); std::swap(NNF,NNF2); // result was written to NNF2; swap makes it current
+
+    checkCudaError(cudaDeviceSynchronize());
+
+    krnlPropagationPass<<<numBlocks,threadsPerBlock>>>(sizeA,sizeB,Omega,patchWidth,patchError,lambda,2,NNF,NNF2,E,mask); std::swap(NNF,NNF2);
+
+    checkCudaError(cudaDeviceSynchronize());
+
+    krnlPropagationPass<<<numBlocks,threadsPerBlock>>>(sizeA,sizeB,Omega,patchWidth,patchError,lambda,1,NNF,NNF2,E,mask); std::swap(NNF,NNF2);
+
+    checkCudaError(cudaDeviceSynchronize());
+
+    for(int r=1;r<max(sizeB(0),sizeB(1))/2;r=r*2) // radii 1,2,4,... below half of B's larger side
+    {
+      krnlRandomSearchPass<<<numBlocks,threadsPerBlock>>>(sizeA,sizeB,Omega,patchWidth,patchError,lambda,r,NNF,E,mask,rngStates); // launched back to back; one synchronize after the loop
+    }
+
+    checkCudaError(cudaDeviceSynchronize());
+  }
+
+  krnlEvalErrorPass<<<numBlocks,threadsPerBlock>>>(patchWidth,patchError,NNF,E); // final E for the resulting NNF
+
+  checkCudaError(cudaDeviceSynchronize());
+}
+
+static A2V2i nnfInitRandom(const V2i& targetSize,
                     const V2i& sourceSize,
                     const int  patchSize)
 {
@@ -26,7 +384,7 @@ A2V2i nnfInitRandom(const V2i& targetSize,
   return NNF;
 }
 
-A2V2i nnfUpscale(const A2V2i& NNF,
+static A2V2i nnfUpscale(const A2V2i& NNF,
                  const int    patchSize,
                  const V2i&   targetSize,
                  const V2i&   sourceSize)
@@ -381,33 +739,33 @@ struct PatchSSD_Split_Modulation
   }
 };
 
-V2i pyramidLevelSize(const V2i& sizeBase,const int numLevels,const int level)
+static V2i pyramidLevelSize(const V2i& sizeBase,const int numLevels,const int level) // Size of pyramid level `level`: sizeBase scaled by 2^-(numLevels-1-level).
 {
-  return V2i(V2f(sizeBase)*pow(2.0f,-float(numLevels-1-level)));
+  return V2i(V2f(sizeBase)*std::pow(2.0f,-float(numLevels-1-level))); // scale factor is 1 for level==numLevels-1 and halves for each level below it
 }
 
 template<int NS,int NG>
-void runEbsynth(int    ebsynthBackend,
-                int    numStyleChannels,
-                int    numGuideChannels,
-                int    sourceWidth,
-                int    sourceHeight,
-                void*  sourceStyleData,
-                void*  sourceGuideData,
-                int    targetWidth,
-                int    targetHeight,
-                void*  targetGuideData,
-                void*  targetModulationData,
-                float* styleWeights,
-                float* guideWeights,
-                float  uniformityWeight,
-                int    patchSize,
-                int    voteMode,
-                int    numPyramidLevels,
-                int*   numSearchVoteItersPerLevel,
-                int*   numPatchMatchItersPerLevel,
-                int*   stopThresholdPerLevel,
-                void*  outputData)
+void ebsynthCuda(int    numStyleChannels,
+                 int    numGuideChannels,
+                 int    sourceWidth,
+                 int    sourceHeight,
+                 void*  sourceStyleData,
+                 void*  sourceGuideData,
+                 int    targetWidth,
+                 int    targetHeight,
+                 void*  targetGuideData,
+                 void*  targetModulationData,
+                 float* styleWeights,
+                 float* guideWeights,
+                 float  uniformityWeight,
+                 int    patchSize,
+                 int    voteMode,
+                 int    numPyramidLevels,
+                 int*   numSearchVoteItersPerLevel,
+                 int*   numPatchMatchItersPerLevel,
+                 int*   stopThresholdPerLevel,
+                 void*  outputNnfData,
+                 void*  outputImageData)
 {
   const int levelCount = numPyramidLevels;
 
@@ -706,7 +1064,11 @@ void runEbsynth(int    ebsynthBackend,
       }
     }
 
-    if (level==levelCount-1) { copy(&outputData,pyramid[pyramid.size()-1].targetStyle); }
+    if (level==levelCount-1)
+    {      
+      if (outputNnfData!=NULL) { copy(&outputNnfData,pyramid[level].NNF); }
+      copy(&outputImageData,pyramid[level].targetStyle);
+    }
 
     pyramid[level].sourceStyle.destroy();
     pyramid[level].sourceGuide.destroy();
@@ -726,62 +1088,60 @@ void runEbsynth(int    ebsynthBackend,
   checkCudaError( cudaFree(rngStates) );
 }
 
-EBSYNTH_API void ebsynthRun(int    ebsynthBackend,
-                            int    numStyleChannels,
-                            int    numGuideChannels,
-                            int    sourceWidth,
-                            int    sourceHeight,
-                            void*  sourceStyleData,
-                            void*  sourceGuideData,
-                            int    targetWidth,
-                            int    targetHeight,
-                            void*  targetGuideData,
-                            void*  targetModulationData,
-                            float* styleWeights,
-                            float* guideWeights,
-                            float  uniformityWeight,
-                            int    patchSize,
-                            int    voteMode,
-                            int    numPyramidLevels,
-                            int*   numSearchVoteItersPerLevel,
-                            int*   numPatchMatchItersPerLevel,
-                            int*   stopThresholdPerLevel,
-                            void*  outputData
-                            )
+void ebsynthRunCuda(int    numStyleChannels,
+                    int    numGuideChannels,
+                    int    sourceWidth,
+                    int    sourceHeight,
+                    void*  sourceStyleData,
+                    void*  sourceGuideData,
+                    int    targetWidth,
+                    int    targetHeight,
+                    void*  targetGuideData,
+                    void*  targetModulationData,
+                    float* styleWeights,
+                    float* guideWeights,
+                    float  uniformityWeight,
+                    int    patchSize,
+                    int    voteMode,
+                    int    numPyramidLevels,
+                    int*   numSearchVoteItersPerLevel,
+                    int*   numPatchMatchItersPerLevel,
+                    int*   stopThresholdPerLevel,
+                    void*  outputNnfData,
+                    void*  outputImageData)
 {
-  void (*const dispatchEbsynth[EBSYNTH_MAX_GUIDE_CHANNELS][EBSYNTH_MAX_STYLE_CHANNELS])(int,int,int,int,int,void*,void*,int,int,void*,void*,float*,float*,float,int,int,int,int*,int*,int*,void*) =
+  void (*const dispatchEbsynth[EBSYNTH_MAX_GUIDE_CHANNELS][EBSYNTH_MAX_STYLE_CHANNELS])(int,int,int,int,void*,void*,int,int,void*,void*,float*,float*,float,int,int,int,int*,int*,int*,void*,void*) =
   {
-    { runEbsynth<1, 1>, runEbsynth<2, 1>, runEbsynth<3, 1>, runEbsynth<4, 1>, runEbsynth<5, 1>, runEbsynth<6, 1>, runEbsynth<7, 1>, runEbsynth<8, 1> },
-    { runEbsynth<1, 2>, runEbsynth<2, 2>, runEbsynth<3, 2>, runEbsynth<4, 2>, runEbsynth<5, 2>, runEbsynth<6, 2>, runEbsynth<7, 2>, runEbsynth<8, 2> },
-    { runEbsynth<1, 3>, runEbsynth<2, 3>, runEbsynth<3, 3>, runEbsynth<4, 3>, runEbsynth<5, 3>, runEbsynth<6, 3>, runEbsynth<7, 3>, runEbsynth<8, 3> },
-    { runEbsynth<1, 4>, runEbsynth<2, 4>, runEbsynth<3, 4>, runEbsynth<4, 4>, runEbsynth<5, 4>, runEbsynth<6, 4>, runEbsynth<7, 4>, runEbsynth<8, 4> },
-    { runEbsynth<1, 5>, runEbsynth<2, 5>, runEbsynth<3, 5>, runEbsynth<4, 5>, runEbsynth<5, 5>, runEbsynth<6, 5>, runEbsynth<7, 5>, runEbsynth<8, 5> },
-    { runEbsynth<1, 6>, runEbsynth<2, 6>, runEbsynth<3, 6>, runEbsynth<4, 6>, runEbsynth<5, 6>, runEbsynth<6, 6>, runEbsynth<7, 6>, runEbsynth<8, 6> },
-    { runEbsynth<1, 7>, runEbsynth<2, 7>, runEbsynth<3, 7>, runEbsynth<4, 7>, runEbsynth<5, 7>, runEbsynth<6, 7>, runEbsynth<7, 7>, runEbsynth<8, 7> },
-    { runEbsynth<1, 8>, runEbsynth<2, 8>, runEbsynth<3, 8>, runEbsynth<4, 8>, runEbsynth<5, 8>, runEbsynth<6, 8>, runEbsynth<7, 8>, runEbsynth<8, 8> },
-    { runEbsynth<1, 9>, runEbsynth<2, 9>, runEbsynth<3, 9>, runEbsynth<4, 9>, runEbsynth<5, 9>, runEbsynth<6, 9>, runEbsynth<7, 9>, runEbsynth<8, 9> },
-    { runEbsynth<1,10>, runEbsynth<2,10>, runEbsynth<3,10>, runEbsynth<4,10>, runEbsynth<5,10>, runEbsynth<6,10>, runEbsynth<7,10>, runEbsynth<8,10> },
-    { runEbsynth<1,11>, runEbsynth<2,11>, runEbsynth<3,11>, runEbsynth<4,11>, runEbsynth<5,11>, runEbsynth<6,11>, runEbsynth<7,11>, runEbsynth<8,11> },
-    { runEbsynth<1,12>, runEbsynth<2,12>, runEbsynth<3,12>, runEbsynth<4,12>, runEbsynth<5,12>, runEbsynth<6,12>, runEbsynth<7,12>, runEbsynth<8,12> },
-    { runEbsynth<1,13>, runEbsynth<2,13>, runEbsynth<3,13>, runEbsynth<4,13>, runEbsynth<5,13>, runEbsynth<6,13>, runEbsynth<7,13>, runEbsynth<8,13> },
-    { runEbsynth<1,14>, runEbsynth<2,14>, runEbsynth<3,14>, runEbsynth<4,14>, runEbsynth<5,14>, runEbsynth<6,14>, runEbsynth<7,14>, runEbsynth<8,14> },
-    { runEbsynth<1,15>, runEbsynth<2,15>, runEbsynth<3,15>, runEbsynth<4,15>, runEbsynth<5,15>, runEbsynth<6,15>, runEbsynth<7,15>, runEbsynth<8,15> },
-    { runEbsynth<1,16>, runEbsynth<2,16>, runEbsynth<3,16>, runEbsynth<4,16>, runEbsynth<5,16>, runEbsynth<6,16>, runEbsynth<7,16>, runEbsynth<8,16> },
-    { runEbsynth<1,17>, runEbsynth<2,17>, runEbsynth<3,17>, runEbsynth<4,17>, runEbsynth<5,17>, runEbsynth<6,17>, runEbsynth<7,17>, runEbsynth<8,17> },
-    { runEbsynth<1,18>, runEbsynth<2,18>, runEbsynth<3,18>, runEbsynth<4,18>, runEbsynth<5,18>, runEbsynth<6,18>, runEbsynth<7,18>, runEbsynth<8,18> },
-    { runEbsynth<1,19>, runEbsynth<2,19>, runEbsynth<3,19>, runEbsynth<4,19>, runEbsynth<5,19>, runEbsynth<6,19>, runEbsynth<7,19>, runEbsynth<8,19> },
-    { runEbsynth<1,20>, runEbsynth<2,20>, runEbsynth<3,20>, runEbsynth<4,20>, runEbsynth<5,20>, runEbsynth<6,20>, runEbsynth<7,20>, runEbsynth<8,20> },
-    { runEbsynth<1,21>, runEbsynth<2,21>, runEbsynth<3,21>, runEbsynth<4,21>, runEbsynth<5,21>, runEbsynth<6,21>, runEbsynth<7,21>, runEbsynth<8,21> },
-    { runEbsynth<1,22>, runEbsynth<2,22>, runEbsynth<3,22>, runEbsynth<4,22>, runEbsynth<5,22>, runEbsynth<6,22>, runEbsynth<7,22>, runEbsynth<8,22> },
-    { runEbsynth<1,23>, runEbsynth<2,23>, runEbsynth<3,23>, runEbsynth<4,23>, runEbsynth<5,23>, runEbsynth<6,23>, runEbsynth<7,23>, runEbsynth<8,23> },
-    { runEbsynth<1,24>, runEbsynth<2,24>, runEbsynth<3,24>, runEbsynth<4,24>, runEbsynth<5,24>, runEbsynth<6,24>, runEbsynth<7,24>, runEbsynth<8,24> }
+    { ebsynthCuda<1, 1>, ebsynthCuda<2, 1>, ebsynthCuda<3, 1>, ebsynthCuda<4, 1>, ebsynthCuda<5, 1>, ebsynthCuda<6, 1>, ebsynthCuda<7, 1>, ebsynthCuda<8, 1> },
+    { ebsynthCuda<1, 2>, ebsynthCuda<2, 2>, ebsynthCuda<3, 2>, ebsynthCuda<4, 2>, ebsynthCuda<5, 2>, ebsynthCuda<6, 2>, ebsynthCuda<7, 2>, ebsynthCuda<8, 2> },
+    { ebsynthCuda<1, 3>, ebsynthCuda<2, 3>, ebsynthCuda<3, 3>, ebsynthCuda<4, 3>, ebsynthCuda<5, 3>, ebsynthCuda<6, 3>, ebsynthCuda<7, 3>, ebsynthCuda<8, 3> },
+    { ebsynthCuda<1, 4>, ebsynthCuda<2, 4>, ebsynthCuda<3, 4>, ebsynthCuda<4, 4>, ebsynthCuda<5, 4>, ebsynthCuda<6, 4>, ebsynthCuda<7, 4>, ebsynthCuda<8, 4> },
+    { ebsynthCuda<1, 5>, ebsynthCuda<2, 5>, ebsynthCuda<3, 5>, ebsynthCuda<4, 5>, ebsynthCuda<5, 5>, ebsynthCuda<6, 5>, ebsynthCuda<7, 5>, ebsynthCuda<8, 5> },
+    { ebsynthCuda<1, 6>, ebsynthCuda<2, 6>, ebsynthCuda<3, 6>, ebsynthCuda<4, 6>, ebsynthCuda<5, 6>, ebsynthCuda<6, 6>, ebsynthCuda<7, 6>, ebsynthCuda<8, 6> },
+    { ebsynthCuda<1, 7>, ebsynthCuda<2, 7>, ebsynthCuda<3, 7>, ebsynthCuda<4, 7>, ebsynthCuda<5, 7>, ebsynthCuda<6, 7>, ebsynthCuda<7, 7>, ebsynthCuda<8, 7> },
+    { ebsynthCuda<1, 8>, ebsynthCuda<2, 8>, ebsynthCuda<3, 8>, ebsynthCuda<4, 8>, ebsynthCuda<5, 8>, ebsynthCuda<6, 8>, ebsynthCuda<7, 8>, ebsynthCuda<8, 8> },
+    { ebsynthCuda<1, 9>, ebsynthCuda<2, 9>, ebsynthCuda<3, 9>, ebsynthCuda<4, 9>, ebsynthCuda<5, 9>, ebsynthCuda<6, 9>, ebsynthCuda<7, 9>, ebsynthCuda<8, 9> },
+    { ebsynthCuda<1,10>, ebsynthCuda<2,10>, ebsynthCuda<3,10>, ebsynthCuda<4,10>, ebsynthCuda<5,10>, ebsynthCuda<6,10>, ebsynthCuda<7,10>, ebsynthCuda<8,10> },
+    { ebsynthCuda<1,11>, ebsynthCuda<2,11>, ebsynthCuda<3,11>, ebsynthCuda<4,11>, ebsynthCuda<5,11>, ebsynthCuda<6,11>, ebsynthCuda<7,11>, ebsynthCuda<8,11> },
+    { ebsynthCuda<1,12>, ebsynthCuda<2,12>, ebsynthCuda<3,12>, ebsynthCuda<4,12>, ebsynthCuda<5,12>, ebsynthCuda<6,12>, ebsynthCuda<7,12>, ebsynthCuda<8,12> },
+    { ebsynthCuda<1,13>, ebsynthCuda<2,13>, ebsynthCuda<3,13>, ebsynthCuda<4,13>, ebsynthCuda<5,13>, ebsynthCuda<6,13>, ebsynthCuda<7,13>, ebsynthCuda<8,13> },
+    { ebsynthCuda<1,14>, ebsynthCuda<2,14>, ebsynthCuda<3,14>, ebsynthCuda<4,14>, ebsynthCuda<5,14>, ebsynthCuda<6,14>, ebsynthCuda<7,14>, ebsynthCuda<8,14> },
+    { ebsynthCuda<1,15>, ebsynthCuda<2,15>, ebsynthCuda<3,15>, ebsynthCuda<4,15>, ebsynthCuda<5,15>, ebsynthCuda<6,15>, ebsynthCuda<7,15>, ebsynthCuda<8,15> },
+    { ebsynthCuda<1,16>, ebsynthCuda<2,16>, ebsynthCuda<3,16>, ebsynthCuda<4,16>, ebsynthCuda<5,16>, ebsynthCuda<6,16>, ebsynthCuda<7,16>, ebsynthCuda<8,16> },
+    { ebsynthCuda<1,17>, ebsynthCuda<2,17>, ebsynthCuda<3,17>, ebsynthCuda<4,17>, ebsynthCuda<5,17>, ebsynthCuda<6,17>, ebsynthCuda<7,17>, ebsynthCuda<8,17> },
+    { ebsynthCuda<1,18>, ebsynthCuda<2,18>, ebsynthCuda<3,18>, ebsynthCuda<4,18>, ebsynthCuda<5,18>, ebsynthCuda<6,18>, ebsynthCuda<7,18>, ebsynthCuda<8,18> },
+    { ebsynthCuda<1,19>, ebsynthCuda<2,19>, ebsynthCuda<3,19>, ebsynthCuda<4,19>, ebsynthCuda<5,19>, ebsynthCuda<6,19>, ebsynthCuda<7,19>, ebsynthCuda<8,19> },
+    { ebsynthCuda<1,20>, ebsynthCuda<2,20>, ebsynthCuda<3,20>, ebsynthCuda<4,20>, ebsynthCuda<5,20>, ebsynthCuda<6,20>, ebsynthCuda<7,20>, ebsynthCuda<8,20> },
+    { ebsynthCuda<1,21>, ebsynthCuda<2,21>, ebsynthCuda<3,21>, ebsynthCuda<4,21>, ebsynthCuda<5,21>, ebsynthCuda<6,21>, ebsynthCuda<7,21>, ebsynthCuda<8,21> },
+    { ebsynthCuda<1,22>, ebsynthCuda<2,22>, ebsynthCuda<3,22>, ebsynthCuda<4,22>, ebsynthCuda<5,22>, ebsynthCuda<6,22>, ebsynthCuda<7,22>, ebsynthCuda<8,22> },
+    { ebsynthCuda<1,23>, ebsynthCuda<2,23>, ebsynthCuda<3,23>, ebsynthCuda<4,23>, ebsynthCuda<5,23>, ebsynthCuda<6,23>, ebsynthCuda<7,23>, ebsynthCuda<8,23> },
+    { ebsynthCuda<1,24>, ebsynthCuda<2,24>, ebsynthCuda<3,24>, ebsynthCuda<4,24>, ebsynthCuda<5,24>, ebsynthCuda<6,24>, ebsynthCuda<7,24>, ebsynthCuda<8,24> }
   };
 
   if (numStyleChannels>=1 && numStyleChannels<=EBSYNTH_MAX_STYLE_CHANNELS &&
       numGuideChannels>=1 && numGuideChannels<=EBSYNTH_MAX_GUIDE_CHANNELS)
   {
-    dispatchEbsynth[numGuideChannels-1][numStyleChannels-1](ebsynthBackend,
-                                                            numStyleChannels,
+    dispatchEbsynth[numGuideChannels-1][numStyleChannels-1](numStyleChannels,
                                                             numGuideChannels,
                                                             sourceWidth,
                                                             sourceHeight,
@@ -800,485 +1160,27 @@ EBSYNTH_API void ebsynthRun(int    ebsynthBackend,
                                                             numSearchVoteItersPerLevel,
                                                             numPatchMatchItersPerLevel,
                                                             stopThresholdPerLevel,
-                                                            outputData);
+                                                            outputNnfData,
+                                                            outputImageData);
   }
 }
 
-EBSYNTH_API
-int ebsynthBackendAvailable(int ebsynthBackend)
+int ebsynthBackendAvailableCuda() // returns 1 if at least one suitable CUDA device is found, 0 otherwise
 {
-  if (ebsynthBackend==EBSYNTH_BACKEND_CUDA)
-  {
-    int deviceCount = -1;
-    if (cudaGetDeviceCount(&deviceCount)!=cudaSuccess) { return 0; }
+  int deviceCount = -1;
+  if (cudaGetDeviceCount(&deviceCount)!=cudaSuccess) { return 0; } // device query failed: report CUDA as unavailable
 
-    for (int device=0;device<deviceCount;device++)
+  for (int device=0;device<deviceCount;device++)
+  {
+    cudaDeviceProp properties;
+    if (cudaGetDeviceProperties(&properties,device)==cudaSuccess)
     {
-      cudaDeviceProp properties;
-      if (cudaGetDeviceProperties(&properties,device)==cudaSuccess)
+      if (properties.major!=9999 && properties.major>=3) // accept major version >= 3; 9999 is presumably a placeholder/emulation device - TODO confirm
       {
-        if (properties.major!=9999 && properties.major>=3)
-        {
-          return 1;
-        }
+        return 1;
       }
     }
   }
 
   return 0;
 }
-
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-#include <cstdio>
-#include <cmath>
-
-#include <vector>
-#include <string>
-#include <algorithm>
-
-#include "jzq.h"
-
-template<typename FUNC>
-bool tryToParseArg(const std::vector<std::string>& args,int* inout_argi,const char* name,bool* out_fail,FUNC handler)
-{
-  int& argi = *inout_argi;
-  bool& fail = *out_fail;
-
-  if (argi<0 || argi>=args.size()) { fail = true; return false; }
-
-  if (args[argi]==name)
-  {
-    argi++;
-    fail = !handler();    
-    return true;
-  }
-
-  fail = false; return false; 
-}
-
-bool tryToParseIntArg(const std::vector<std::string>& args,int* inout_argi,const char* name,int* out_value,bool* out_fail)
-{
-  return tryToParseArg(args,inout_argi,name,out_fail,[&]
-  {
-    int& argi = *inout_argi;
-    if (argi<args.size())
-    {
-      const std::string& arg = args[argi];
-      try
-      {
-        std::size_t pos = 0;
-        *out_value = std::stoi(arg,&pos);
-        if (pos!=arg.size()) { printf("error: bad %s argument '%s'\n",name,arg.c_str()); return false; }
-        return true;
-      }
-      catch(...)
-      {
-        printf("error: bad %s argument '%s'\n",name,arg.c_str());
-        return false;
-      }   
-    }
-    printf("error: missing argument for the %s option\n",name);
-    return false;
-  });
-}
-
-bool tryToParseFloatArg(const std::vector<std::string>& args,int* inout_argi,const char* name,float* out_value,bool* out_fail)
-{
-  return tryToParseArg(args,inout_argi,name,out_fail,[&]
-  {
-    int& argi = *inout_argi;
-    if (argi<args.size())
-    {
-      const std::string& arg = args[argi];
-      try
-      {
-        std::size_t pos = 0;
-        *out_value = std::stof(arg,&pos);
-        if (pos!=arg.size()) { printf("error: bad %s argument '%s'\n",name,arg.c_str()); return false; }
-        return true;
-      }
-      catch(...)
-      {
-        printf("error: bad %s argument '%s'\n",name,args[argi].c_str());
-        return false;
-      }   
-    }
-    printf("error: missing argument for the %s option\n",name);
-    return false;
-  });
-}
-
-bool tryToParseStringArg(const std::vector<std::string>& args,int* inout_argi,const char* name,std::string* out_value,bool* out_fail)
-{
-  return tryToParseArg(args,inout_argi,name,out_fail,[&]
-  {
-    int& argi = *inout_argi;
-    if (argi<args.size())
-    {
-      *out_value = args[argi];
-      return true;
-    }
-    printf("error: missing argument for the %s option\n",name);
-    return false;
-  });
-}
-
-bool tryToParseStringPairArg(const std::vector<std::string>& args,int* inout_argi,const char* name,std::pair<std::string,std::string>* out_value,bool* out_fail)
-{
-  return tryToParseArg(args,inout_argi,name,out_fail,[&]
-  {
-    int& argi = *inout_argi;
-    if ((argi+1)<args.size())
-    {
-      *out_value = std::make_pair(args[argi],args[argi+1]);
-      argi++;
-      return true;
-    }
-    printf("error: missing argument for the %s option\n",name);
-    return false;
-  });
-}
-
-#define STB_IMAGE_IMPLEMENTATION
-#include "stb_image.h"
-
-#define STB_IMAGE_WRITE_IMPLEMENTATION
-#include "stb_image_write.h"
-
-unsigned char* tryLoad(const std::string& fileName,int* width,int* height)
-{
-  unsigned char* data = stbi_load(fileName.c_str(),width,height,NULL,4);
-  if (data==NULL)
-  {
-    printf("error: failed to load '%s'\n",fileName.c_str());
-    printf("%s\n",stbi_failure_reason());
-    exit(1);
-  }
-  return data;
-}
-
-int evalNumChannels(const unsigned char* data,const int numPixels)
-{
-  bool isGray = true;
-  bool hasAlpha = false;
-
-  for(int xy=0;xy<numPixels;xy++)
-  {
-    const unsigned char r = data[xy*4+0];
-    const unsigned char g = data[xy*4+1];
-    const unsigned char b = data[xy*4+2];
-    const unsigned char a = data[xy*4+3];
-
-    if (!(r==g && g==b)) { isGray  = false; }
-    if (a<255)           { hasAlpha = true; }
-  }
-
-  const int numChannels = (isGray ? 1 : 3) + (hasAlpha ? 1 : 0);
-
-  return numChannels;
-}
-
-V2i pyramidLevelSize(const V2i& sizeBase,const int level)
-{
-  return V2i(V2f(sizeBase)*pow(2.0f,-float(level)));
-}
-
-int main(int argc,char** argv)
-{
-  if (argc<2)
-  {
-    printf("usage: %s [options]\n",argv[0]);
-    printf("\n");
-    printf("options:\n");
-    printf("  -style <style.png>\n");
-    printf("  -guide <source.png> <target.png>\n");
-    printf("  -output <output.png>\n");
-    printf("  -weight <value>\n");
-    printf("  -uniformity <value>\n");
-    printf("  -patchsize <size>\n");
-    printf("  -pyramidlevels <number>\n");
-    printf("  -searchvoteiters <number>\n");
-    printf("  -patchmatchiters <number>\n");
-    printf("  -stopthreshold <value>\n");
-    printf("\n");
-    return 1;
-  }
-
-  std::string styleFileName;
-  float       styleWeight = NAN;
-  std::string outputFileName = "output.png";
-
-  struct Guide
-  {
-    std::string    sourceFileName;
-    std::string    targetFileName;
-    float          weight;
-
-    int            sourceWidth;
-    int            sourceHeight;
-    unsigned char* sourceData;
-
-    int            targetWidth;
-    int            targetHeight;
-    unsigned char* targetData;
-    
-    int            numChannels;
-  };
-
-  std::vector<Guide> guides;
-
-  float uniformityWeight = 3500;
-  int patchSize = 5; 
-  int numPyramidLevels = -1;
-  int numSearchVoteIters = 6;
-  int numPatchMatchIters = 4;
-  int stopThreshold = 5;
-
-  std::string backend;
-
-  {
-    std::vector<std::string> args(argc);
-    for(int i=0;i<argc;i++) { args[i] = argv[i]; }
-  
-    bool fail = false;
-    int argi = 1;   
-
-    float* precedingStyleOrGuideWeight = 0;
-    while(argi<argc && !fail)
-    {
-      float weight;
-      std::pair<std::string,std::string> guidePair;
-      
-      if      (tryToParseStringArg(args,&argi,"-style",&styleFileName,&fail))
-      {
-        styleWeight = NAN;
-        precedingStyleOrGuideWeight = &styleWeight;
-        argi++;
-      }
-      else if (tryToParseStringPairArg(args,&argi,"-guide",&guidePair,&fail))
-      {
-        Guide guide;
-        guide.sourceFileName = guidePair.first;
-        guide.targetFileName = guidePair.second;
-        guide.weight = NAN;
-        guides.push_back(guide);
-        precedingStyleOrGuideWeight = &guides[guides.size()-1].weight;
-        argi++;
-      }
-      else if (tryToParseStringArg(args,&argi,"-output",&outputFileName,&fail))
-      {
-        argi++;
-      }
-      else if (tryToParseFloatArg(args,&argi,"-weight",&weight,&fail))
-      {
-        if (precedingStyleOrGuideWeight!=0) { *precedingStyleOrGuideWeight = weight; }
-        else { printf("error: at least one -style or -guide option must precede the -weight option!\n"); return 1; }
-        argi++;
-      }
-      else if (tryToParseFloatArg(args,&argi,"-uniformity",&uniformityWeight,&fail)) { argi++; }
-      else if (tryToParseIntArg(args,&argi,"-patchsize",&patchSize,&fail))
-      {
-        if (patchSize<3)    { printf("error: patchsize is too small!\n"); return 1; }
-        if (patchSize%2==0) { printf("error: patchsize must be an odd number!\n"); return 1; }
-        argi++;
-      }
-      else if (tryToParseIntArg(args,&argi,"-pyramidlevels",&numPyramidLevels,&fail))
-      {
-        if (numPyramidLevels<1) { printf("error: bad argument for -pyramidlevels!\n"); return 1; }
-        argi++;
-      }
-      else if (tryToParseIntArg(args,&argi,"-searchvoteiters",&numSearchVoteIters,&fail))
-      {
-        if (numSearchVoteIters<0) { printf("error: bad argument for -searchvoteiters!\n"); return 1; }
-        argi++;
-      }
-      else if (tryToParseIntArg(args,&argi,"-patchmatchiters",&numPatchMatchIters,&fail))
-      {
-        if (numPatchMatchIters<0) { printf("error: bad argument for -patchmatchiters!\n"); return 1; }
-        argi++;
-      }
-      else if (tryToParseIntArg(args,&argi,"-stopthreshold",&stopThreshold,&fail))
-      {
-        if (stopThreshold<0) { printf("error: bad argument for -stopthreshold!\n"); return 1; }
-        argi++;
-      }
-      else
-      {
-        printf("error: unrecognized option '%s'\n",args[argi].c_str());
-        fail = true;
-      }
-    }
-    
-    if (fail) { return 1; }
-  }
-
-  const int numGuides = guides.size();
-
-  int sourceWidth = 0;
-  int sourceHeight = 0;
-  unsigned char* sourceStyleData = tryLoad(styleFileName,&sourceWidth,&sourceHeight);
-  const int numStyleChannelsTotal = evalNumChannels(sourceStyleData,sourceWidth*sourceHeight);
-
-  std::vector<unsigned char> sourceStyle(sourceWidth*sourceHeight*numStyleChannelsTotal);
-  for(int xy=0;xy<sourceWidth*sourceHeight;xy++)
-  {
-    if      (numStyleChannelsTotal>0)  { sourceStyle[xy*numStyleChannelsTotal+0] = sourceStyleData[xy*4+0]; }
-    if      (numStyleChannelsTotal==2) { sourceStyle[xy*numStyleChannelsTotal+1] = sourceStyleData[xy*4+3]; }           
-    else if (numStyleChannelsTotal>1)  { sourceStyle[xy*numStyleChannelsTotal+1] = sourceStyleData[xy*4+1]; }
-    if      (numStyleChannelsTotal>2)  { sourceStyle[xy*numStyleChannelsTotal+2] = sourceStyleData[xy*4+2]; }
-    if      (numStyleChannelsTotal>3)  { sourceStyle[xy*numStyleChannelsTotal+3] = sourceStyleData[xy*4+3]; }                 
-  }
-  
-  int targetWidth = 0;
-  int targetHeight = 0;
-  int numGuideChannelsTotal = 0;
-
-  for(int i=0;i<numGuides;i++)
-  {
-    Guide& guide = guides[i];
-
-    guide.sourceData = tryLoad(guide.sourceFileName,&guide.sourceWidth,&guide.sourceHeight);
-    guide.targetData = tryLoad(guide.targetFileName,&guide.targetWidth,&guide.targetHeight);
-      
-    if              (guide.sourceWidth!=sourceWidth || guide.sourceHeight!=sourceHeight)  { printf("error: source guide '%s' doesn't match the resolution of '%s'\n",guide.sourceFileName.c_str(),styleFileName.c_str()); return 1; }      
-    if      (i>0 && (guide.targetWidth!=targetWidth || guide.targetHeight!=targetHeight)) { printf("error: target guide '%s' doesn't match the resolution of '%s'\n",guide.targetFileName.c_str(),guides[0].targetFileName.c_str()); return 1; }
-    else if (i==0) { targetWidth = guide.targetWidth; targetHeight = guide.targetHeight; }
-
-    guide.numChannels = std::max(evalNumChannels(guide.sourceData,sourceWidth*sourceHeight),
-                                 evalNumChannels(guide.targetData,targetWidth*targetHeight));    
-  
-    numGuideChannelsTotal += guide.numChannels;
-  }
-  
-  if (numStyleChannelsTotal>EBSYNTH_MAX_STYLE_CHANNELS) { printf("error: too many style channels (%d), maximum number is %d\n",numStyleChannelsTotal,EBSYNTH_MAX_STYLE_CHANNELS); return 1; }
-  if (numGuideChannelsTotal>EBSYNTH_MAX_GUIDE_CHANNELS) { printf("error: too many guide channels (%d), maximum number is %d\n",numGuideChannelsTotal,EBSYNTH_MAX_GUIDE_CHANNELS); return 1; }
-
-  std::vector<unsigned char> sourceGuides(sourceWidth*sourceHeight*numGuideChannelsTotal);
-  for(int xy=0;xy<sourceWidth*sourceHeight;xy++)
-  {
-    int c = 0;
-    for(int i=0;i<numGuides;i++)
-    { 
-      const int numChannels = guides[i].numChannels;  
-
-      if      (numChannels>0)  { sourceGuides[xy*numGuideChannelsTotal+c+0] = guides[i].sourceData[xy*4+0]; }
-      if      (numChannels==2) { sourceGuides[xy*numGuideChannelsTotal+c+1] = guides[i].sourceData[xy*4+3]; }           
-      else if (numChannels>1)  { sourceGuides[xy*numGuideChannelsTotal+c+1] = guides[i].sourceData[xy*4+1]; }
-      if      (numChannels>2)  { sourceGuides[xy*numGuideChannelsTotal+c+2] = guides[i].sourceData[xy*4+2]; }
-      if      (numChannels>3)  { sourceGuides[xy*numGuideChannelsTotal+c+3] = guides[i].sourceData[xy*4+3]; }            
-      
-      c += numChannels;
-    }
-  }
-
-  std::vector<unsigned char> targetGuides(targetWidth*targetHeight*numGuideChannelsTotal);
-  for(int xy=0;xy<targetWidth*targetHeight;xy++)
-  {
-    int c = 0;
-    for(int i=0;i<numGuides;i++)
-    { 
-      const int numChannels = guides[i].numChannels;  
-
-      if      (numChannels>0)  { targetGuides[xy*numGuideChannelsTotal+c+0] = guides[i].targetData[xy*4+0]; }
-      if      (numChannels==2) { targetGuides[xy*numGuideChannelsTotal+c+1] = guides[i].targetData[xy*4+3]; }           
-      else if (numChannels>1)  { targetGuides[xy*numGuideChannelsTotal+c+1] = guides[i].targetData[xy*4+1]; }
-      if      (numChannels>2)  { targetGuides[xy*numGuideChannelsTotal+c+2] = guides[i].targetData[xy*4+2]; }
-      if      (numChannels>3)  { targetGuides[xy*numGuideChannelsTotal+c+3] = guides[i].targetData[xy*4+3]; }            
-      
-      c += numChannels;
-    }
-  }
-
-  std::vector<float> styleWeights(numStyleChannelsTotal);
-  if (isnan(styleWeight)) { styleWeight = 1.0f; }
-  for(int i=0;i<numStyleChannelsTotal;i++) { styleWeights[i] = styleWeight / float(numStyleChannelsTotal); }
-
-  for(int i=0;i<numGuides;i++) { if (isnan(guides[i].weight)) { guides[i].weight = 1.0f/float(numGuides); } }
-
-  std::vector<float> guideWeights(numGuideChannelsTotal);
-  {
-    int c = 0;
-    for(int i=0;i<numGuides;i++)
-    { 
-      const int numChannels = guides[i].numChannels;  
-      
-      for(int j=0;j<numChannels;j++)
-      {
-        guideWeights[c+j] = guides[i].weight / float(numChannels);
-      }
-
-      c += numChannels; 
-    }
-  }
-
-  int maxPyramidLevels = 0;
-  for(int level=32;level>=0;level--)
-  {
-    if (min(pyramidLevelSize(std::min(V2i(sourceWidth,sourceHeight),V2i(targetWidth,targetHeight)),level)) >= (2*patchSize+1))
-    {
-      maxPyramidLevels = level+1;
-      break;
-    }
-  }
-
-  if (numPyramidLevels==-1) { numPyramidLevels = maxPyramidLevels; }
-  numPyramidLevels = std::min(numPyramidLevels,maxPyramidLevels); 
-
-  std::vector<int> numSearchVoteItersPerLevel(numPyramidLevels);
-  std::vector<int> numPatchMatchItersPerLevel(numPyramidLevels);
-  std::vector<int> stopThresholdPerLevel(numPyramidLevels);
-  for(int i=0;i<numPyramidLevels;i++)
-  {
-    numSearchVoteItersPerLevel[i] = numSearchVoteIters;
-    numPatchMatchItersPerLevel[i] = numPatchMatchIters;
-    stopThresholdPerLevel[i] = stopThreshold;
-  }
-
-  std::vector<unsigned char> output(targetWidth*targetHeight*numStyleChannelsTotal);
-
-  printf("uniformity: %.0f\n",uniformityWeight);
-  printf("patchsize: %d\n",patchSize);
-  printf("pyramidlevels: %d\n",numPyramidLevels);
-  printf("searchvoteiters: %d\n",numSearchVoteIters);
-  printf("patchmatchiters: %d\n",numPatchMatchIters);
-  printf("stopthreshold: %d\n",stopThreshold);
-
-  if (!ebsynthBackendAvailable(EBSYNTH_BACKEND_CUDA)) { printf("error: the CUDA backend is not available!\n"); return 1; }
-
-  ebsynthRun(EBSYNTH_BACKEND_CUDA,
-             numStyleChannelsTotal,
-             numGuideChannelsTotal,
-             sourceWidth,
-             sourceHeight,
-             sourceStyle.data(),
-             sourceGuides.data(),
-             targetWidth,
-             targetHeight,
-             targetGuides.data(),
-             NULL,
-             styleWeights.data(),
-             guideWeights.data(),
-             uniformityWeight,
-             patchSize,
-             EBSYNTH_VOTEMODE_PLAIN,
-             numPyramidLevels,
-             numSearchVoteItersPerLevel.data(),
-             numPatchMatchItersPerLevel.data(),
-             stopThresholdPerLevel.data(),
-             output.data());
-
-  stbi_write_png(outputFileName.c_str(),targetWidth,targetHeight,numStyleChannelsTotal,output.data(),numStyleChannelsTotal*targetWidth);
-
-  printf("result was written to %s\n",outputFileName.c_str());
-
-  stbi_image_free(sourceStyleData);
-
-  for(int i=0;i<numGuides;i++)
-  {
-    stbi_image_free(guides[i].sourceData);
-    stbi_image_free(guides[i].targetData);
-  }
-  
-  return 0;
-}
diff --git a/src/ebsynth_cuda.h b/src/ebsynth_cuda.h
new file mode 100644
index 0000000..4068aac
--- /dev/null
+++ b/src/ebsynth_cuda.h
@@ -0,0 +1,32 @@
+// This software is in the public domain. Where that dedication is not
+// recognized, you are granted a perpetual, irrevocable license to copy
+// and modify this file as you see fit.
+
+#ifndef EBSYNTH_CUDA_H_
+#define EBSYNTH_CUDA_H_
+// Entry points of the CUDA backend. src/ebsynth_nocuda.cpp defines stub versions with the same signatures for builds without CUDA.
+void ebsynthRunCuda(int    numStyleChannels,
+                    int    numGuideChannels,
+                    int    sourceWidth,
+                    int    sourceHeight,
+                    void*  sourceStyleData,
+                    void*  sourceGuideData,
+                    int    targetWidth,
+                    int    targetHeight,
+                    void*  targetGuideData,
+                    void*  targetModulationData,
+                    float* styleWeights,
+                    float* guideWeights,
+                    float  uniformityWeight,
+                    int    patchSize,
+                    int    voteMode,
+                    int    numPyramidLevels,
+                    int*   numSearchVoteItersPerLevel,
+                    int*   numPatchMatchItersPerLevel,
+                    int*   stopThresholdPerLevel,
+                    void*  outputNnfData,
+                    void*  outputImageData);
+// Returns 1 when the CUDA backend can be used and 0 otherwise (the no-CUDA stub always returns 0).
+int ebsynthBackendAvailableCuda();
+
+#endif
diff --git a/src/cudacheck.h b/src/ebsynth_cuda_check.h
similarity index 86%
rename from src/cudacheck.h
rename to src/ebsynth_cuda_check.h
index 835aab7..9d83ed8 100644
--- a/src/cudacheck.h
+++ b/src/ebsynth_cuda_check.h
@@ -1,5 +1,5 @@
-#ifndef CUDACHECK_H_
-#define CUDACHECK_H_
+#ifndef EBSYNTH_CUDA_CHECK_H_
+#define EBSYNTH_CUDA_CHECK_H_
 
 template<typename T>
 bool checkCudaError_(T result,char const* const func,const char* const file,int const line)
diff --git a/src/memarray2.h b/src/ebsynth_cuda_memarray2.h
similarity index 93%
rename from src/memarray2.h
rename to src/ebsynth_cuda_memarray2.h
index 3c45bfd..8de0319 100644
--- a/src/memarray2.h
+++ b/src/ebsynth_cuda_memarray2.h
@@ -2,11 +2,11 @@
 // recognized, you are granted a perpetual, irrevocable license to copy
 // and modify this file as you see fit.
 
-#ifndef MEMARRAY2_H_
-#define MEMARRAY2_H_
+#ifndef EBSYNTH_CUDA_MEMARRAY2_H_
+#define EBSYNTH_CUDA_MEMARRAY2_H_
 
 #include "jzq.h"
-//#include "cudacheck.h"
+#include "ebsynth_cuda_check.h"
 
 template<typename T>
 struct MemArray2
diff --git a/src/texarray2.h b/src/ebsynth_cuda_texarray2.h
similarity index 98%
rename from src/texarray2.h
rename to src/ebsynth_cuda_texarray2.h
index de9678d..0427f77 100644
--- a/src/texarray2.h
+++ b/src/ebsynth_cuda_texarray2.h
@@ -2,11 +2,11 @@
 // recognized, you are granted a perpetual, irrevocable license to copy
 // and modify this file as you see fit.
 
-#ifndef TEXARRAY2_H_
-#define TEXARRAY2_H_
+#ifndef EBSYNTH_CUDA_TEXARRAY2_H_
+#define EBSYNTH_CUDA_TEXARRAY2_H_
 
 #include "jzq.h"
-#include "cudacheck.h"
+#include "ebsynth_cuda_check.h"
 
 #include <cuda_runtime.h>
 
diff --git a/src/ebsynth_nocuda.cpp b/src/ebsynth_nocuda.cpp
new file mode 100644
index 0000000..b87b64a
--- /dev/null
+++ b/src/ebsynth_nocuda.cpp
@@ -0,0 +1,33 @@
+// This software is in the public domain. Where that dedication is not
+// recognized, you are granted a perpetual, irrevocable license to copy
+// and modify this file as you see fit.
+
+void ebsynthRunCuda(int    numStyleChannels,
+                    int    numGuideChannels,
+                    int    sourceWidth,
+                    int    sourceHeight,
+                    void*  sourceStyleData,
+                    void*  sourceGuideData,
+                    int    targetWidth,
+                    int    targetHeight,
+                    void*  targetGuideData,
+                    void*  targetModulationData,
+                    float* styleWeights,
+                    float* guideWeights,
+                    float  uniformityWeight,
+                    int    patchSize,
+                    int    voteMode,
+                    int    numPyramidLevels,
+                    int*   numSearchVoteItersPerLevel,
+                    int*   numPatchMatchItersPerLevel,
+                    int*   stopThresholdPerLevel,
+                    void*  outputNnfData,
+                    void*  outputImageData)
+{
+  // Deliberately empty: this file stands in for the CUDA backend in builds without CUDA, so no output buffer is written.
+}
+
+int ebsynthBackendAvailableCuda()
+{
+  return 0; // this stub replaces the CUDA backend in builds without CUDA, so the backend is never available
+}
diff --git a/src/jzq.h b/src/jzq.h
index 391fae9..507526d 100644
--- a/src/jzq.h
+++ b/src/jzq.h
@@ -1,1901 +1,1994 @@
-// This software is in the public domain. Where that dedication is not
-// recognized, you are granted a perpetual, irrevocable license to copy
-// and modify this file as you see fit.
-
-#ifndef JZQ_H_
-#define JZQ_H_
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstdarg>
-#include <vector>
-#include <string>
-#include <algorithm>
-
-template<typename T> struct zero { static __host__ __device__ T value(); };
-
-template<typename T> inline T clamp(const T& x,const T& xmin,const T& xmax);
-template<typename T> inline T lerp(const T& a,const T& b,const T& t);
-
-inline std::string spf(const std::string fmt,...);
-
-template<int N,typename T>
-struct Vec
-{
-  T v[N];
-
-  __host__ __device__ Vec<N,T>();
-  template<typename T2> __host__ __device__ explicit Vec<N,T>(const Vec<N,T2>& u);
-  explicit __host__ __device__ Vec<N,T>(T v0);
-
-  __host__ __device__ Vec<N,T>(T v0,T v1);
-  __host__ __device__ Vec<N,T>(T v0,T v1,T v2);
-  __host__ __device__ Vec<N,T>(T v0,T v1,T v2,T v3);
-  __host__ __device__ Vec<N,T>(T v0,T v1,T v2,T v3,T v4);
-  __host__ __device__ Vec<N,T>(T v0,T v1,T v2,T v3,T v4,T v5);
-
-  __host__ __device__ T&       operator()(int i);
-  __host__ __device__ const T& operator()(int i) const;
-  __host__ __device__ T&       operator[](int i);
-  __host__ __device__ const T& operator[](int i) const;
-
-  __host__ __device__ Vec<N,T> operator*=(const Vec<N,T>& u);
-  __host__ __device__ Vec<N,T> operator+=(const Vec<N,T>& u);
-
-  __host__ __device__ Vec<N,T> operator*=(T s);
-  __host__ __device__ Vec<N,T> operator+=(T s);
-};
-
-template<int N,typename T> Vec<N,T> __host__ __device__ operator-(const Vec<N,T>& u);
-template<int N,typename T> Vec<N,T> __host__ __device__ operator+(const Vec<N,T>& u,const Vec<N,T>& v);
-template<int N,typename T> Vec<N,T> __host__ __device__ operator-(const Vec<N,T>& u,const Vec<N,T>& v);
-template<int N,typename T> Vec<N,T> __host__ __device__ operator-(const Vec<N,T>& u,const T v);
-template<int N,typename T> Vec<N,T> __host__ __device__ operator*(const Vec<N,T>& u,const Vec<N,T>& v);
-template<int N,typename T> Vec<N,T> __host__ __device__ operator/(const Vec<N,T>& u,const Vec<N,T>& v);
-template<int N,typename T> Vec<N,T> __host__ __device__ operator*(const T s,const Vec<N,T>& u);
-template<int N,typename T> Vec<N,T> __host__ __device__ operator*(const Vec<N,T>& u,const T s);
-template<int N,typename T> Vec<N,T> __host__ __device__ operator/(const Vec<N,T>& u,const T s);
-
-template<int N,typename T> Vec<N,bool> __host__ __device__ operator<(const Vec<N,T>& u,const Vec<N,T>& v);
-template<int N,typename T> Vec<N,bool> __host__ __device__ operator>(const Vec<N,T>& u,const Vec<N,T>& v);
-template<int N,typename T> Vec<N,bool> __host__ __device__ operator<=(const Vec<N,T>& u,const Vec<N,T>& v);
-template<int N,typename T> Vec<N,bool> __host__ __device__ operator>=(const Vec<N,T>& u,const Vec<N,T>& v);
-template<int N,typename T> Vec<N,bool> __host__ __device__ operator==(const Vec<N,T>& u,const Vec<N,T>& v);
-template<int N,typename T> Vec<N,bool> __host__ __device__ operator!=(const Vec<N,T>& u,const Vec<N,T>& v);
-
-template<int N,typename T> inline T        dot(const Vec<N,T>& u,const Vec<N,T>& v);
-template<typename T>       inline T        cross(const Vec<2,T> &a,const Vec<2,T> &b);
-template<typename T>       inline Vec<3,T> cross(const Vec<3,T> &a,const Vec<3,T> &b);
-template<int N,typename T> inline T        norm(const Vec<N,T>& u);
-template<int N,typename T> inline Vec<N,T> normalize(const Vec<N,T>& u);
-template<int N,typename T> inline T        min(const Vec<N,T>& u);
-template<int N,typename T> inline T        max(const Vec<N,T>& u);
-template<int N,typename T> inline T        sum(const Vec<N,T>& u);
-namespace std
-{
-template<int N,typename T> inline Vec<N,T> min(const Vec<N,T>& u,const Vec<N,T>& v);
-template<int N,typename T> inline Vec<N,T> max(const Vec<N,T>& u,const Vec<N,T>& v);
-}
-template<int N,typename T> inline Vec<N,T> abs(const Vec<N,T>& x);
-
-template<int N>            inline bool     any(const Vec<N,bool>& u);
-template<int N>            inline bool     all(const Vec<N,bool>& u);
-
-template<int M,int N,typename T>
-struct Mat
-{
-  T m[M][N];
-
-  Mat<M,N,T>();
-
-  Mat<M,N,T>(T a00,T a01,
-             T a10,T a11);
-
-  Mat<M,N,T>(T a00,T a01,T a02,
-             T a10,T a11,T a12,
-             T a20,T a21,T a22);
-
-  Mat<M,N,T>(T a00,T a01,T a02,T a03,
-             T a10,T a11,T a12,T a13,
-             T a20,T a21,T a22,T a23,
-             T a30,T a31,T a32,T a33);
-
-  T&       operator()(int i,int j);
-  const T& operator()(int i,int j) const;
-
-  T*       data();
-  const T* data() const;
-};
-
-template<int M1,int N1,int M2,int N2,typename T> Mat<M1,N2,T> operator*(const Mat<M1,N1,T>& A,const Mat<M2,N2,T>& B);
-
-template<int M,int N,typename T> Vec<M,T> operator*(const Mat<M,N,T>& A,const Vec<N,T>& u);
-template<int M,int N,typename T> Vec<N,T> operator*(const Vec<M,T>& u,const Mat<M,N,T>& A);
-
-template<int M,int N,typename T> Mat<N,M,T> transpose(const Mat<M,N,T>& A);
-template<int N,typename T>       T          trace(const Mat<N,N,T>& A);
-template<int N,typename T>       Mat<N,N,T> inverse(const Mat<N,N,T>& A);
-
-template<typename T>
-class Array2
-{
-public:
-  Array2();
-  Array2(int width,int height);
-  explicit Array2(const Vec<2,int>& size);
-  Array2(const Array2<T>& a);
-  ~Array2();
-
-  Array2&  operator=(const Array2<T>& a);
-
-  inline T&       operator[](int i);
-  inline const T& operator[](int i) const;
-  inline T&       operator()(int i,int j);
-  inline const T& operator()(int i,int j) const;
-  inline T&       operator()(const Vec<2,int>& ij);
-  inline const T& operator()(const Vec<2,int>& ij) const;
-
-  Vec<2,int> size() const;
-  int        size(int dim) const;
-  int        width() const;
-  int        height() const;
-  int        numel() const;
-  T*         data();
-  const T*   data() const;
-  void       clear();
-  void       swap(Array2<T>& b);
-  bool       empty() const;
-
-private:
-  Vec<2,int> s;
-  T* d;
-};
-
-template<typename T> Vec<2,int> size(const Array2<T>& a);
-template<typename T> int        size(const Array2<T>& a,int dim);
-template<typename T> int        numel(const Array2<T>& a);
-template<typename T> void       clear(Array2<T>* a);
-template<typename T> void       swap(Array2<T>& a,Array2<T>& b);
-template<typename T> T          min(const Array2<T>& a);
-template<typename T> T          max(const Array2<T>& a);
-template<typename T> Vec<2,T>   minmax(const Array2<T>& a);
-template<typename T> Vec<2,int> argmin(const Array2<T>& a);
-template<typename T> Vec<2,int> argmax(const Array2<T>& a);
-template<typename T> T          sum(const Array2<T>& a);
-template<typename T> void       fill(Array2<T>* a,const T& value);
-
-template<typename T,typename F> Array2<T> apply(const Array2<T>& a,F fun);
-
-template<typename T>
-class Array3
-{
-public:
-  Array3();
-  explicit Array3(const Vec<3,int>& size);
-  Array3(int width,int height,int depth);
-  Array3(const Array3<T>& a);
-  ~Array3();
-
-  Array3& operator=(const Array3<T>& a);
-
-  inline T&       operator[](int i);
-  inline const T& operator[](int i) const;
-  inline T&       operator()(int i,int j,int k);
-  inline const T& operator()(int i,int j,int k) const;
-  inline T&       operator()(const Vec<3,int>& ijk);
-  inline const T& operator()(const Vec<3,int>& ijk) const;
-
-  Vec<3,int> size() const;
-  int        size(int dim) const;
-  int        width() const;
-  int        height() const;
-  int        depth() const;
-  int        numel() const;
-  T*         data();
-  const T*   data() const;
-  void       clear();
-  void       swap(Array3<T>& b);
-
-private:
-  Vec<3,int> s;
-  T* d;
-};
-
-template<typename T> Vec<3,int> size(const Array3<T>& a);
-template<typename T> int        size(const Array3<T>& a,int dim);
-template<typename T> int        numel(const Array3<T>& a);
-template<typename T> void       clear(Array3<T>* a);
-template<typename T> void       swap(Array3<T>& a,Array3<T>& b);
-
-typedef Vec<2,double>         Vec2d;
-typedef Vec<2,float>          Vec2f;
-typedef Vec<2,int>            Vec2i;
-typedef Vec<2,unsigned int>   Vec2ui;
-typedef Vec<2,short>          Vec2s;
-typedef Vec<2,unsigned short> Vec2us;
-typedef Vec<2,char>           Vec2c;
-typedef Vec<2,unsigned char>  Vec2uc;
-
-typedef Vec<3,double>         Vec3d;
-typedef Vec<3,float>          Vec3f;
-typedef Vec<3,int>            Vec3i;
-typedef Vec<3,unsigned int>   Vec3ui;
-typedef Vec<3,short>          Vec3s;
-typedef Vec<3,unsigned short> Vec3us;
-typedef Vec<3,char>           Vec3c;
-typedef Vec<3,unsigned char>  Vec3uc;
-
-typedef Vec<4,double>         Vec4d;
-typedef Vec<4,float>          Vec4f;
-typedef Vec<4,int>            Vec4i;
-typedef Vec<4,unsigned int>   Vec4ui;
-typedef Vec<4,short>          Vec4s;
-typedef Vec<4,unsigned short> Vec4us;
-typedef Vec<4,char>           Vec4c;
-typedef Vec<4,unsigned char>  Vec4uc;
-
-typedef Vec<5,double>         Vec5d;
-typedef Vec<5,float>          Vec5f;
-typedef Vec<5,int>            Vec5i;
-typedef Vec<5,unsigned int>   Vec5ui;
-typedef Vec<5,short>          Vec5s;
-typedef Vec<5,unsigned short> Vec5us;
-typedef Vec<5,char>           Vec5c;
-typedef Vec<5,unsigned char>  Vec5uc;
-
-typedef Vec<6,double>         Vec6d;
-typedef Vec<6,float>          Vec6f;
-typedef Vec<6,int>            Vec6i;
-typedef Vec<6,unsigned int>   Vec6ui;
-typedef Vec<6,short>          Vec6s;
-typedef Vec<6,unsigned short> Vec6us;
-typedef Vec<6,char>           Vec6c;
-typedef Vec<6,unsigned char>  Vec6uc;
-
-typedef Vec<2,double>         V2d;
-typedef Vec<2,float>          V2f;
-typedef Vec<2,int>            V2i;
-typedef Vec<2,unsigned int>   V2ui;
-typedef Vec<2,short>          V2s;
-typedef Vec<2,unsigned short> V2us;
-typedef Vec<2,char>           V2c;
-typedef Vec<2,unsigned char>  V2uc;
-
-typedef Vec<3,double>         V3d;
-typedef Vec<3,float>          V3f;
-typedef Vec<3,int>            V3i;
-typedef Vec<3,unsigned int>   V3ui;
-typedef Vec<3,short>          V3s;
-typedef Vec<3,unsigned short> V3us;
-typedef Vec<3,char>           V3c;
-typedef Vec<3,unsigned char>  V3uc;
-
-typedef Vec<4,double>         V4d;
-typedef Vec<4,float>          V4f;
-typedef Vec<4,int>            V4i;
-typedef Vec<4,unsigned int>   V4ui;
-typedef Vec<4,short>          V4s;
-typedef Vec<4,unsigned short> V4us;
-typedef Vec<4,char>           V4c;
-typedef Vec<4,unsigned char>  V4uc;
-
-typedef Vec<5,double>         V5d;
-typedef Vec<5,float>          V5f;
-typedef Vec<5,int>            V5i;
-typedef Vec<5,unsigned int>   V5ui;
-typedef Vec<5,short>          V5s;
-typedef Vec<5,unsigned short> V5us;
-typedef Vec<5,char>           V5c;
-typedef Vec<5,unsigned char>  V5uc;
-
-typedef Vec<6,double>         V6d;
-typedef Vec<6,float>          V6f;
-typedef Vec<6,int>            V6i;
-typedef Vec<6,unsigned int>   V6ui;
-typedef Vec<6,short>          V6s;
-typedef Vec<6,unsigned short> V6us;
-typedef Vec<6,char>           V6c;
-typedef Vec<6,unsigned char>  V6uc;
-
-typedef Mat<2,2,float> Mat2x2f;
-typedef Mat<2,3,float> Mat2x3f;
-typedef Mat<2,4,float> Mat2x4f;
-typedef Mat<2,5,float> Mat2x5f;
-typedef Mat<2,6,float> Mat2x6f;
-typedef Mat<2,7,float> Mat2x7f;
-typedef Mat<2,8,float> Mat2x8f;
-typedef Mat<3,2,float> Mat3x2f;
-typedef Mat<3,3,float> Mat3x3f;
-typedef Mat<3,4,float> Mat3x4f;
-typedef Mat<3,5,float> Mat3x5f;
-typedef Mat<3,6,float> Mat3x6f;
-typedef Mat<3,7,float> Mat3x7f;
-typedef Mat<3,8,float> Mat3x8f;
-typedef Mat<4,2,float> Mat4x2f;
-typedef Mat<4,3,float> Mat4x3f;
-typedef Mat<4,4,float> Mat4x4f;
-typedef Mat<4,5,float> Mat4x5f;
-typedef Mat<4,6,float> Mat4x6f;
-typedef Mat<4,7,float> Mat4x7f;
-typedef Mat<4,8,float> Mat4x8f;
-typedef Mat<5,2,float> Mat5x2f;
-typedef Mat<5,3,float> Mat5x3f;
-typedef Mat<5,4,float> Mat5x4f;
-typedef Mat<5,5,float> Mat5x5f;
-typedef Mat<5,6,float> Mat5x6f;
-typedef Mat<5,7,float> Mat5x7f;
-typedef Mat<5,8,float> Mat5x8f;
-typedef Mat<6,2,float> Mat6x2f;
-typedef Mat<6,3,float> Mat6x3f;
-typedef Mat<6,4,float> Mat6x4f;
-typedef Mat<6,5,float> Mat6x5f;
-typedef Mat<6,6,float> Mat6x6f;
-typedef Mat<6,7,float> Mat6x7f;
-typedef Mat<6,8,float> Mat6x8f;
-typedef Mat<7,2,float> Mat7x2f;
-typedef Mat<7,3,float> Mat7x3f;
-typedef Mat<7,4,float> Mat7x4f;
-typedef Mat<7,5,float> Mat7x5f;
-typedef Mat<7,6,float> Mat7x6f;
-typedef Mat<7,7,float> Mat7x7f;
-typedef Mat<7,8,float> Mat7x8f;
-typedef Mat<8,2,float> Mat8x2f;
-typedef Mat<8,3,float> Mat8x3f;
-typedef Mat<8,4,float> Mat8x4f;
-typedef Mat<8,5,float> Mat8x5f;
-typedef Mat<8,6,float> Mat8x6f;
-typedef Mat<8,7,float> Mat8x7f;
-typedef Mat<8,8,float> Mat8x8f;
-
-typedef Mat<2,2,double> Mat2x2d;
-typedef Mat<2,3,double> Mat2x3d;
-typedef Mat<2,4,double> Mat2x4d;
-typedef Mat<2,5,double> Mat2x5d;
-typedef Mat<2,6,double> Mat2x6d;
-typedef Mat<2,7,double> Mat2x7d;
-typedef Mat<2,8,double> Mat2x8d;
-typedef Mat<3,2,double> Mat3x2d;
-typedef Mat<3,3,double> Mat3x3d;
-typedef Mat<3,4,double> Mat3x4d;
-typedef Mat<3,5,double> Mat3x5d;
-typedef Mat<3,6,double> Mat3x6d;
-typedef Mat<3,7,double> Mat3x7d;
-typedef Mat<3,8,double> Mat3x8d;
-typedef Mat<4,2,double> Mat4x2d;
-typedef Mat<4,3,double> Mat4x3d;
-typedef Mat<4,4,double> Mat4x4d;
-typedef Mat<4,5,double> Mat4x5d;
-typedef Mat<4,6,double> Mat4x6d;
-typedef Mat<4,7,double> Mat4x7d;
-typedef Mat<4,8,double> Mat4x8d;
-typedef Mat<5,2,double> Mat5x2d;
-typedef Mat<5,3,double> Mat5x3d;
-typedef Mat<5,4,double> Mat5x4d;
-typedef Mat<5,5,double> Mat5x5d;
-typedef Mat<5,6,double> Mat5x6d;
-typedef Mat<5,7,double> Mat5x7d;
-typedef Mat<5,8,double> Mat5x8d;
-typedef Mat<6,2,double> Mat6x2d;
-typedef Mat<6,3,double> Mat6x3d;
-typedef Mat<6,4,double> Mat6x4d;
-typedef Mat<6,5,double> Mat6x5d;
-typedef Mat<6,6,double> Mat6x6d;
-typedef Mat<6,7,double> Mat6x7d;
-typedef Mat<6,8,double> Mat6x8d;
-typedef Mat<7,2,double> Mat7x2d;
-typedef Mat<7,3,double> Mat7x3d;
-typedef Mat<7,4,double> Mat7x4d;
-typedef Mat<7,5,double> Mat7x5d;
-typedef Mat<7,6,double> Mat7x6d;
-typedef Mat<7,7,double> Mat7x7d;
-typedef Mat<7,8,double> Mat7x8d;
-typedef Mat<8,2,double> Mat8x2d;
-typedef Mat<8,3,double> Mat8x3d;
-typedef Mat<8,4,double> Mat8x4d;
-typedef Mat<8,5,double> Mat8x5d;
-typedef Mat<8,6,double> Mat8x6d;
-typedef Mat<8,7,double> Mat8x7d;
-typedef Mat<8,8,double> Mat8x8d;
-
-typedef Array2<double>                  Array2d;
-typedef Array2<float>                   Array2f;
-typedef Array2<int>                     Array2i;
-typedef Array2<unsigned int>            Array2ui;
-typedef Array2<short>                   Array2s;
-typedef Array2<unsigned short>          Array2us;
-typedef Array2<char>                    Array2c;
-typedef Array2<unsigned char>           Array2uc;
-
-typedef Array2< Vec<2,double> >         Array2V2d;
-typedef Array2< Vec<2,float> >          Array2V2f;
-typedef Array2< Vec<2,int> >            Array2V2i;
-typedef Array2< Vec<2,unsigned int> >   Array2V2ui;
-typedef Array2< Vec<2,short> >          Array2V2s;
-typedef Array2< Vec<2,unsigned short> > Array2V2us;
-typedef Array2< Vec<2,char> >           Array2V2c;
-typedef Array2< Vec<2,unsigned char> >  Array2V2uc;
-
-typedef Array2< Vec<3,double> >         Array2V3d;
-typedef Array2< Vec<3,float> >          Array2V3f;
-typedef Array2< Vec<3,int> >            Array2V3i;
-typedef Array2< Vec<3,unsigned int> >   Array2V3ui;
-typedef Array2< Vec<3,short> >          Array2V3s;
-typedef Array2< Vec<3,unsigned short> > Array2V3us;
-typedef Array2< Vec<3,char> >           Array2V3c;
-typedef Array2< Vec<3,unsigned char> >  Array2V3uc;
-
-typedef Array2< Vec<4,double> >         Array2V4d;
-typedef Array2< Vec<4,float> >          Array2V4f;
-typedef Array2< Vec<4,int> >            Array2V4i;
-typedef Array2< Vec<4,unsigned int> >   Array2V4ui;
-typedef Array2< Vec<4,short> >          Array2V4s;
-typedef Array2< Vec<4,unsigned short> > Array2V4us;
-typedef Array2< Vec<4,char> >           Array2V4c;
-typedef Array2< Vec<4,unsigned char> >  Array2V4uc;
-
-typedef Array2<double>                  A2d;
-typedef Array2<float>                   A2f;
-typedef Array2<int>                     A2i;
-typedef Array2<unsigned int>            A2ui;
-typedef Array2<short>                   A2s;
-typedef Array2<unsigned short>          A2us;
-typedef Array2<char>                    A2c;
-typedef Array2<unsigned char>           A2uc;
-
-typedef Array2< Vec<2,double> >         A2V2d;
-typedef Array2< Vec<2,float> >          A2V2f;
-typedef Array2< Vec<2,int> >            A2V2i;
-typedef Array2< Vec<2,unsigned int> >   A2V2ui;
-typedef Array2< Vec<2,short> >          A2V2s;
-typedef Array2< Vec<2,unsigned short> > A2V2us;
-typedef Array2< Vec<2,char> >           A2V2c;
-typedef Array2< Vec<2,unsigned char> >  A2V2uc;
-
-typedef Array2< Vec<3,double> >         A2V3d;
-typedef Array2< Vec<3,float> >          A2V3f;
-typedef Array2< Vec<3,int> >            A2V3i;
-typedef Array2< Vec<3,unsigned int> >   A2V3ui;
-typedef Array2< Vec<3,short> >          A2V3s;
-typedef Array2< Vec<3,unsigned short> > A2V3us;
-typedef Array2< Vec<3,char> >           A2V3c;
-typedef Array2< Vec<3,unsigned char> >  A2V3uc;
-
-typedef Array2< Vec<4,double> >         A2V4d;
-typedef Array2< Vec<4,float> >          A2V4f;
-typedef Array2< Vec<4,int> >            A2V4i;
-typedef Array2< Vec<4,unsigned int> >   A2V4ui;
-typedef Array2< Vec<4,short> >          A2V4s;
-typedef Array2< Vec<4,unsigned short> > A2V4us;
-typedef Array2< Vec<4,char> >           A2V4c;
-typedef Array2< Vec<4,unsigned char> >  A2V4uc;
-
-typedef Array3<double>                  Array3d;
-typedef Array3<float>                   Array3f;
-typedef Array3<int>                     Array3i;
-typedef Array3<unsigned int>            Array3ui;
-typedef Array3<short>                   Array3s;
-typedef Array3<unsigned short>          Array3us;
-typedef Array3<char>                    Array3c;
-typedef Array3<unsigned char>           Array3uc;
-
-typedef Array3< Vec<2,double> >         Array3V2d;
-typedef Array3< Vec<2,float> >          Array3V2f;
-typedef Array3< Vec<2,int> >            Array3V2i;
-typedef Array3< Vec<2,unsigned int> >   Array3V2ui;
-typedef Array3< Vec<2,short> >          Array3V2s;
-typedef Array3< Vec<2,unsigned short> > Array3V2us;
-typedef Array3< Vec<2,char> >           Array3V2c;
-typedef Array3< Vec<2,unsigned char> >  Array3V2uc;
-
-typedef Array3< Vec<3,double> >         Array3V3d;
-typedef Array3< Vec<3,float> >          Array3V3f;
-typedef Array3< Vec<3,int> >            Array3V3i;
-typedef Array3< Vec<3,unsigned int> >   Array3V3ui;
-typedef Array3< Vec<3,short> >          Array3V3s;
-typedef Array3< Vec<3,unsigned short> > Array3V3us;
-typedef Array3< Vec<3,char> >           Array3V3c;
-typedef Array3< Vec<3,unsigned char> >  Array3V3uc;
-
-typedef Array3< Vec<4,double> >         Array3V4d;
-typedef Array3< Vec<4,float> >          Array3V4f;
-typedef Array3< Vec<4,int> >            Array3V4i;
-typedef Array3< Vec<4,unsigned int> >   Array3V4ui;
-typedef Array3< Vec<4,short> >          Array3V4s;
-typedef Array3< Vec<4,unsigned short> > Array3V4us;
-typedef Array3< Vec<4,char> >           Array3V4c;
-typedef Array3< Vec<4,unsigned char> >  Array3V4uc;
-
-typedef Array3<double>                  A3d;
-typedef Array3<float>                   A3f;
-typedef Array3<int>                     A3i;
-typedef Array3<unsigned int>            A3ui;
-typedef Array3<short>                   A3s;
-typedef Array3<unsigned short>          A3us;
-typedef Array3<char>                    A3c;
-typedef Array3<unsigned char>           A3uc;
-
-typedef Array3< Vec<2,double> >         A3V2d;
-typedef Array3< Vec<2,float> >          A3V2f;
-typedef Array3< Vec<2,int> >            A3V2i;
-typedef Array3< Vec<2,unsigned int> >   A3V2ui;
-typedef Array3< Vec<2,short> >          A3V2s;
-typedef Array3< Vec<2,unsigned short> > A3V2us;
-typedef Array3< Vec<2,char> >           A3V2c;
-typedef Array3< Vec<2,unsigned char> >  A3V2uc;
-
-typedef Array3< Vec<3,double> >         A3V3d;
-typedef Array3< Vec<3,float> >          A3V3f;
-typedef Array3< Vec<3,int> >            A3V3i;
-typedef Array3< Vec<3,unsigned int> >   A3V3ui;
-typedef Array3< Vec<3,short> >          A3V3s;
-typedef Array3< Vec<3,unsigned short> > A3V3us;
-typedef Array3< Vec<3,char> >           A3V3c;
-typedef Array3< Vec<3,unsigned char> >  A3V3uc;
-
-typedef Array3< Vec<4,double> >         A3V4d;
-typedef Array3< Vec<4,float> >          A3V4f;
-typedef Array3< Vec<4,int> >            A3V4i;
-typedef Array3< Vec<4,unsigned int> >   A3V4ui;
-typedef Array3< Vec<4,short> >          A3V4s;
-typedef Array3< Vec<4,unsigned short> > A3V4us;
-typedef Array3< Vec<4,char> >           A3V4c;
-typedef Array3< Vec<4,unsigned char> >  A3V4uc;
-
-template<> struct zero<char          > { static __host__ __device__ char           value() { return 0;    } };
-template<> struct zero<unsigned char > { static __host__ __device__ unsigned char  value() { return 0;    } };
-template<> struct zero<short         > { static __host__ __device__ short          value() { return 0;    } };
-template<> struct zero<unsigned short> { static __host__ __device__ unsigned short value() { return 0;    } };
-template<> struct zero<int           > { static __host__ __device__ int            value() { return 0;    } };
-template<> struct zero<unsigned int  > { static __host__ __device__ unsigned int   value() { return 0;    } };
-template<> struct zero<float         > { static __host__ __device__ float          value() { return 0.0f; } };
-template<> struct zero<double        > { static __host__ __device__ double         value() { return 0.0;  } };
-
-template<int N,typename T>
-struct zero<Vec<N,T>>
-{
-  static __host__ __device__ Vec<N,T> value()
-  {
-    Vec<N,T> z;
-    for(int i=0;i<N;i++) { z[i] = zero<T>::value(); }
-    return z;
-  }
-};
-
-template<int M,int N,typename T>
-struct zero<Mat<M,N,T>>
-{
-  static __host__ __device__ Mat<M,N,T> value()
-  {
-    Mat<M,N,T> z;
-    for(int i=0;i<M;i++)
-    for(int j=0;j<N;j++)
-    {
-      z(i,j) = zero<T>::value();
-    }
-    return z;
-  }
-};
-
-template <typename T> inline
-T clamp(const T& x,const T& xmin,const T& xmax)
-{
-  return std::min(std::max(x,xmin),xmax);
-}
-
-template <typename T> inline
-T lerp(const T& a,const T& b,const T& t)
-{
-  return (1.0-t)*a+t*b;
-}
-
-inline std::string spf(const std::string fmt,...)
-{
-  int size = 1024;
-  std::vector<char> buf;
-  va_list ap;
-
-  while(1)
-  {
-    if(size>16*1024*1024) { return std::string(""); }
-
-    buf.resize(size);
-
-    va_start(ap,fmt);
-    const int n = vsnprintf(&buf[0],size-1,fmt.c_str(),ap);
-    va_end(ap);
-
-    if(n>-1 && n < size)
-    {
-      break;
-    }
-    else if(n>-1)
-    {
-      size = n + 1;
-    }
-    else
-    {
-      size = 2*size;
-    }
-  }
-
-  return std::string(&buf[0]);
-}
-
-template<int N,typename T>
-__host__ __device__
-Vec<N,T>::Vec()
-{
-}
-
-template<int N,typename T>
-__host__ __device__
-Vec<N,T>::Vec(T v0)
-{
-  assert(N==1);
-  v[0]=v0;
-}
-
-template<int N,typename T>
-__host__ __device__
-Vec<N,T>::Vec(T v0,T v1)
-{
-  assert(N==2);
-  v[0]=v0; v[1]=v1;
-}
-
-template<int N,typename T>
-__host__ __device__
-Vec<N,T>::Vec(T v0,T v1,T v2)
-{
-  assert(N==3);
-  v[0]=v0; v[1]=v1; v[2]=v2;
-}
-
-template<int N,typename T>
-__host__ __device__
-Vec<N,T>::Vec(T v0,T v1,T v2,T v3)
-{
-  assert(N==4);
-  v[0]=v0; v[1]=v1; v[2]=v2; v[3]=v3;
-}
-
-template<int N,typename T>
-__host__ __device__
-Vec<N,T>::Vec(T v0,T v1,T v2,T v3,T v4)
-{
-  assert(N==5);
-  v[0]=v0; v[1]=v1; v[2]=v2; v[3]=v3; v[4]=v4;
-}
-
-template<int N,typename T>
-__host__ __device__
-Vec<N,T>::Vec(T v0,T v1,T v2,T v3,T v4,T v5)
-{
-  assert(N==6);
-  v[0]=v0; v[1]=v1; v[2]=v2; v[3]=v3; v[4]=v4; v[5]=v5;
-}
-
-template<int N,typename T> template<typename T2>
-__host__ __device__
-Vec<N,T>::Vec(const Vec<N,T2>& u)
-{
-  for(int i=0;i<N;i++)
-  {
-    v[i] = static_cast<T>(u.v[i]);
-  }
-}
-
-template<int N,typename T>
-__host__ __device__
-T& Vec<N,T>::operator()(int i)
-{
-  assert(i>=0 && i<N);
-  return v[i];
-}
-
-template<int N,typename T>
-__host__ __device__
-const T& Vec<N,T>::operator()(int i) const
-{
-  assert(i>=0 && i<N);
-  return v[i];
-}
-
-template<int N,typename T>
-__host__ __device__
-T& Vec<N,T>::operator[](int i)
-{
-  assert(i>=0 && i<N);
-  return v[i];
-}
-
-template<int N,typename T>
-__host__ __device__
-const T& Vec<N,T>::operator[](int i) const
-{
-  assert(i>=0 && i<N);
-  return v[i];
-}
-
-template<int N,typename T>
-__host__ __device__
-Vec<N,T> Vec<N,T>::operator*=(const Vec<N,T>& u)
-{
-  for(int i=0;i<N;i++) v[i]*=u(i);
-  return *this;
-}
-
-template<int N,typename T>
-__host__ __device__
-Vec<N,T> Vec<N,T>::operator+=(const Vec<N,T>& u)
-{
-  for(int i=0;i<N;i++) v[i]+=u(i);
-  return *this;
-}
-
-template<int N,typename T>
-__host__ __device__
-Vec<N,T> Vec<N,T>::operator*=(T s)
-{
-  for(int i=0;i<N;i++) v[i]*=s;
-  return *this;
-}
-
-template<int N,typename T>
-__host__ __device__
-Vec<N,T> Vec<N,T>::operator+=(T s)
-{
-  for(int i=0;i<N;i++) v[i]+=s;
-  return *this;
-}
-
-template<int N,typename T>
-__host__ __device__
-Vec<N,T> operator-(const Vec<N,T>& u)
-{
-  Vec<N,T> r;
-  for(int i=0;i<N;i++) r(i)=-u(i);
-  return r;
-}
-
-template<int N,typename T>
-__host__ __device__
-Vec<N,T> operator+(const Vec<N,T>& u,const Vec<N,T>& v)
-{
-  Vec<N,T> r;
-  for(int i=0;i<N;i++) r(i)=u(i)+v(i);
-  return r;
-}
-
-template<int N,typename T>
-__host__ __device__
-Vec<N,T> operator-(const Vec<N,T>& u,const Vec<N,T>& v)
-{
-  Vec<N,T> r;
-  for(int i=0;i<N;i++) r(i)=u(i)-v(i);
-  return r;
-}
-
-template<int N,typename T>
-__host__ __device__
-Vec<N,T> operator-(const Vec<N,T>& u,const T v)
-{
-  Vec<N,T> r;
-  for(int i=0;i<N;i++) r(i)=u(i)-v;
-  return r;
-}
-
-template<int N,typename T>
-__host__ __device__
-Vec<N,T> operator*(const Vec<N,T>& u,const Vec<N,T>& v)
-{
-  Vec<N,T> r;
-  for(int i=0;i<N;i++) r(i)=u(i)*v(i);
-  return r;
-}
-
-template<int N,typename T>
-__host__ __device__
-Vec<N,T> operator/(const Vec<N,T>& u,const Vec<N,T>& v)
-{
-  Vec<N,T> r;
-  for(int i=0;i<N;i++) r(i)=u(i)/v(i);
-  return r;
-}
-
-template<int N,typename T>
-__host__ __device__
-Vec<N,T> operator*(const T s,const Vec<N,T>& u)
-{
-  Vec<N,T> r;
-  for(int i=0;i<N;i++) r(i)=s*u(i);
-  return r;
-}
-
-template<int N,typename T>
-__host__ __device__
-Vec<N,T> operator*(const Vec<N,T>& u,const T s)
-{
-  Vec<N,T> r;
-  for(int i=0;i<N;i++) r(i)=u(i)*s;
-  return r;
-}
-
-template<int N,typename T>
-__host__ __device__
-Vec<N,T> operator/(const Vec<N,T>& u,const T s)
-{
-  Vec<N,T> r;
-  for(int i=0;i<N;i++) r(i)=u(i)/s;
-  return r;
-}
-
-template<int N,typename T>
-__host__ __device__
-Vec<N,bool> operator<(const Vec<N,T>& u,const Vec<N,T>& v)
-{
-  Vec<N,bool> r;
-  for(int i=0;i<N;i++) r(i)=u(i)<v(i);
-  return r;
-}
-
-template<int N,typename T>
-__host__ __device__
-Vec<N,bool> operator>(const Vec<N,T>& u,const Vec<N,T>& v)
-{
-  Vec<N,bool> r;
-  for(int i=0;i<N;i++) r(i)=u(i)>v(i);
-  return r;
-}
-
-template<int N,typename T>
-__host__ __device__
-Vec<N,bool> operator<=(const Vec<N,T>& u,const Vec<N,T>& v)
-{
-  Vec<N,bool> r;
-  for(int i=0;i<N;i++) r(i)=u(i)<=v(i);
-  return r;
-}
-
-template<int N,typename T>
-__host__ __device__
-Vec<N,bool> operator>=(const Vec<N,T>& u,const Vec<N,T>& v)
-{
-  Vec<N,bool> r;
-  for(int i=0;i<N;i++) r(i)=u(i)>=v(i);
-  return r;
-}
-
-template<int N,typename T>
-__host__ __device__
-Vec<N,bool> operator==(const Vec<N,T>& u,const Vec<N,T>& v)
-{
-  Vec<N,bool> r;
-  for(int i=0;i<N;i++) r(i) = (u(i)==v(i));
-  return r;
-}
-
-template<int N,typename T>
-__host__ __device__
-Vec<N,bool> operator!=(const Vec<N,T>& u,const Vec<N,T>& v)
-{
-  Vec<N,bool> r;
-  for(int i=0;i<N;i++) r(i) = (u(i)!=v(i));
-  return r;
-}
-
-template<int N,typename T>
-inline T dot(const Vec<N,T>& u,const Vec<N,T>& v)
-{
-  assert(N>0);
-  T sumprod = u(0)*v(0);
-  for(int i=1;i<N;i++) sumprod += u(i)*v(i);
-  return sumprod;
-}
-
-template<typename T>
-inline T cross(const Vec<2,T> &a,const Vec<2,T> &b)
-{
-  return a[0]*b[1]-a[1]*b[0];
-}
-
-template<typename T>
-inline Vec<3,T> cross(const Vec<3,T> &a,const Vec<3,T> &b)
-{
-  return Vec<3,T>(a[1]*b[2]-a[2]*b[1],
-                  a[2]*b[0]-a[0]*b[2],
-                  a[0]*b[1]-a[1]*b[0]);
-}
-
-template<int N,typename T>
-inline T norm(const Vec<N,T>& u)
-{
-  return std::sqrt(dot(u,u));
-}
-
-template<int N,typename T>
-inline Vec<N,T> normalize(const Vec<N,T>& u)
-{
-  return u/norm(u);
-}
-
-template<int N>
-inline bool any(const Vec<N,bool>& u)
-{
-  for(int i=0;i<N;i++)
-  {
-    if (u(i)==true) return true;
-  }
-  return false;
-}
-
-template<int N>
-inline bool all(const Vec<N,bool>& u)
-{
-  for(int i=0;i<N;i++)
-  {
-    if (u(i)==false) return false;
-  }
-  return true;
-}
-
-template<int N,typename T>
-inline T min(const Vec<N,T>& u)
-{
-  assert(N>0);
-
-  T minval = u(0);
-
-  for(int i=1;i<N;i++)
-  {
-   if (u(i) < minval)
-   {
-      minval = u(i);
-   }
-  }
-
-  return minval;
-}
-
-template<int N,typename T>
-inline T max(const Vec<N,T>& u)
-{
-  assert(N>0);
-
-  T maxval = u(0);
-
-  for(int i=1;i<N;i++)
-  {
-   if (u(i) > maxval)
-   {
-      maxval = u(i);
-   }
-  }
-
-  return maxval;
-}
-
-template<int N,typename T>
-inline T sum(const Vec<N,T>& u)
-{
-  assert(N>0);
-
-  T sumval = u(0);
-
-  for(int i=1;i<N;i++)
-  {
-    sumval += u(i);
-  }
-
-  return sumval;
-}
-
-
-namespace std
-{
-template<int N,typename T> Vec<N,T>
-inline min(const Vec<N,T>& u,const Vec<N,T>& v)
-{
-  assert(N>0);
-
-  Vec<N,T> w;
-
-  for(int i=0;i<N;i++)
-  {
-    w(i) = min(u(i),v(i));
-  }
-
-  return w;
-}
-
-template<int N,typename T> Vec<N,T>
-inline max(const Vec<N,T>& u,const Vec<N,T>& v)
-{
-  assert(N>0);
-
-  Vec<N,T> w;
-
-  for(int i=0;i<N;i++)
-  {
-    w(i) = max(u(i),v(i));
-  }
-
-  return w;
-}
-}
-
-template<int N,typename T> Vec<N,T>
-inline abs(const Vec<N,T>& x)
-{
-  Vec<N,T> out;
-  for(int i=0;i<N;i++) out(i) = abs(x(i));
-  return out;
-}
-
-#define fori(I) for (int i=0;i<(I);i++)
-#define forj(J) for (int j=0;j<(J);j++)
-#define fork(K) for (int k=0;k<(K);k++)
-#define forij(I,J) for (int i=0;i<(I);i++) for (int j=0;j<(J);j++)
-
-template<int M,int N,typename T>
-Mat<M,N,T>::Mat() {}
-
-template<int M,int N,typename T>
-Mat<M,N,T>::Mat(T a00,T a01,
-                T a10,T a11)
-{
-  assert(M==2 && N==2);
-
-  m[0][0] = a00; m[0][1] = a01;
-  m[1][0] = a10; m[1][1] = a11;
-}
-
-template<int M,int N,typename T>
-Mat<M,N,T>::Mat(T a00,T a01,T a02,
-                T a10,T a11,T a12,
-                T a20,T a21,T a22)
-{
-  assert(M==3 && N==3);
-
-  m[0][0] = a00; m[0][1] = a01; m[0][2] = a02;
-  m[1][0] = a10; m[1][1] = a11; m[1][2] = a12;
-  m[2][0] = a20; m[2][1] = a21; m[2][2] = a22;
-}
-
-template<int M,int N,typename T>
-Mat<M,N,T>::Mat(T a00,T a01,T a02,T a03,
-                T a10,T a11,T a12,T a13,
-                T a20,T a21,T a22,T a23,
-                T a30,T a31,T a32,T a33)
-{
-  assert(M==4 && N==4);
-
-  m[0][0] = a00; m[0][1] = a01; m[0][2] = a02; m[0][3] = a03;
-  m[1][0] = a10; m[1][1] = a11; m[1][2] = a12; m[1][3] = a13;
-  m[2][0] = a20; m[2][1] = a21; m[2][2] = a22; m[2][3] = a23;
-  m[3][0] = a30; m[3][1] = a31; m[3][2] = a32; m[3][3] = a33;
-}
-
-template<int M,int N,typename T>
-T& Mat<M,N,T>::operator()(int i,int j)
-{
-  assert(0<=i && i<M);
-  assert(0<=j && j<N);
-  return m[i][j];
-}
-
-template<int M,int N,typename T>
-const T& Mat<M,N,T>::operator()(int i,int j) const
-{
-  assert(0<=i && i<M);
-  assert(0<=j && j<N);
-  return m[i][j];
-}
-
-template<int M,int N,typename T>
-T* Mat<M,N,T>::data()
-{
-  return (T*)(&m[0][0]);
-}
-
-template<int M,int N,typename T>
-const T* Mat<M,N,T>::data() const
-{
-  return (T*)(&m[0][0]);
-}
-
-template<int M1,int N1,int M2,int N2,typename T>
-Mat<M1,N2,T> operator*(const Mat<M1,N1,T>& A,const Mat<M2,N2,T>& B)
-{
-  assert(N1==M2);
-  Mat<M1,N2,T> C;
-
-  fori(M1)
-  forj(N2)
-  {
-    T dot = 0;
-    fork(N1) dot += A(i,k) * B(k,j);
-    C(i,j) = dot;
-  }
-
-  return C;
-}
-
-template<int M,int N,typename T>
-Vec<M,T> operator*(const Mat<M,N,T>& A,const Vec<N,T>& u)
-{
-  Vec<M,T> v;
-
-  fori(M)
-  {
-    T dot = 0;
-    forj(N) dot += A(i,j) * u(j);
-    v(i) = dot;
-  }
-
-  return v;
-}
-
-template<int M,int N,typename T>
-Vec<N,T> operator*(const Vec<M,T>& u,const Mat<M,N,T>& A)
-{
-  Vec<N,T> v;
-
-  forj(N)
-  {
-    T dot = 0;
-    fori(M) dot += A(i,j) * u(i);
-    v(j) = dot;
-  }
-
-  return v;
-}
-
-/*
-template<int N, class T>
-Mat<N,N,T> identity()
-{
-    Mat<N,N,T> A;
-    forij(N,N) A(i,j) = ((i==j) ? 1 : 0);
-    return A;
-}
-*/
-
-template<int M,int N,typename T>
-Mat<N,M,T> transpose(const Mat<M,N,T>& A)
-{
-  Mat<N,M,T> At;
-
-  forij(N,M) At(i,j) = A(j,i);
-
-  return At;
-}
-
-template<int N,typename T>
-T trace(const Mat<N,N,T>& A)
-{
-  T sum = 0;
-
-  fori(N) sum += A(i,i);
-
-  return sum;
-}
-
-template<int N,typename T>
-Mat<N,N,T> inverse(const Mat<N,N,T>& A)
-{
-  Mat<N,N,T> invA;
-
-  invA = A;
-
-  Vec<N,int> colIndex;
-  Vec<N,int> rowIndex;
-  Vec<N,bool> pivoted;
-
-  fori(N) pivoted(i) = false;
-
-  int i1, i2, row = 0, col = 0;
-  T save;
-
-  for (int i0 = 0; i0 < N; i0++)
-  {
-    T fMax = 0.0f;
-    for (i1 = 0; i1 < N; i1++)
-    {
-      if (!pivoted(i1))
-      {
-        for (i2 = 0; i2 < N; i2++)
-        {
-          if (!pivoted(i2))
-          {
-            T fs = abs(invA(i1,i2));
-            if (fs > fMax)
-            {
-              fMax = fs;
-              row = i1;
-              col = i2;
-            }
-          }
-        }
-      }
-    }
-
-    //assert(fmax > eps)
-
-    pivoted(col) = true;
-
-    if (row != col)
-    {
-        forj(N) { T tmp = invA(row,j); invA(row,j) = invA(col,j); invA(col,j) = tmp; }
-    }
-
-    rowIndex(i0) = row;
-    colIndex(i0) = col;
-
-    T inv = ((T)1.0)/invA(col,col);
-    invA(col,col) = (T)1.0;
-    for (i2 = 0; i2 < N; i2++)
-    {
-      invA(col,i2) *= inv;
-    }
-
-    for (i1 = 0; i1 < N; i1++)
-    {
-      if (i1 != col)
-      {
-        save = invA(i1,col);
-        invA(i1,col) = (T)0.0;
-        for (i2 = 0; i2 < N; i2++)
-        {
-          invA(i1,i2) -= invA(col,i2)*save;
-        }
-      }
-    }
-  }
-
-  for (i1 = N-1; i1 >= 0; i1--)
-  {
-    if (rowIndex(i1) != colIndex(i1))
-    {
-      for (i2 = 0; i2 < N; i2++)
-      {
-        save = invA(i2,rowIndex(i1));
-        invA(i2,rowIndex(i1)) = invA(i2,colIndex(i1));
-        invA(i2,colIndex(i1)) = save;
-      }
-    }
-  }
-
-  return invA;
-}
-
-#undef fori
-#undef forj
-#undef fork
-
-template<typename T>
-Array2<T>::Array2() : s(0,0),d(0) {}
-
-template<typename T>
-Array2<T>::Array2(int width,int height)
-{
-  assert(width>0 && height>0);
-  s = Vec2i(width,height);
-  d = new T[s(0)*s(1)];
-}
-
-template<typename T>
-Array2<T>::Array2(const Vec2i& size)
-{
-  // XXX: predelat na neco jako assert(all(s>0));
-  assert(size(0)>0 && size(1)>0);
-  s = size;
-  d = new T[s(0)*s(1)];
-}
-
-template<typename T>
-Array2<T>::Array2(const Array2<T>& a)
-{
-  //  printf("COPY CONSTRUCTOR\n");
-  s = a.s;
-
-  if (s(0)>0 && s(1)>0)
-  {
-    d = new T[s(0)*s(1)];
-
-    // XXX: optimize this:
-    for(int i=0;i<s(0)*s(1);i++) d[i] = a.d[i];
-  }
-  else
-  {
-    d = 0;
-  }
-}
-
-template<typename T>
-Array2<T>& Array2<T>::operator=(const Array2<T>& a)
-{
-  // printf("ASSIGNMENT\n");
-  // printf("slow copy\n");
-  if (this!=&a)
-  {
-    if (s(0)==a.s(0) && s(1)==a.s(1))
-    {
-      // XXX: optimize this:
-      for(int i=0;i<s(0)*s(1);i++) d[i] = a.d[i];
-      //memcpy(d,a.d,numel()*sizeof(T)); //XXX this will break down when T is not POD !!!
-    }
-    else
-    {
-      delete[] d;
-      s = a.s;
-
-      if (a.s(0)>0 && a.s(1)>0)
-      {
-        d = new T[s(0)*s(1)];
-        //memcpy(d,a.d,numel()*sizeof(T)); //XXX this will break down when T is not POD !!!
-        // XXX: optimize this:
-        for(int i=0;i<s(0)*s(1);i++) d[i] = a.d[i];
-      }
-      else
-      {
-        d = 0;
-      }
-    }
-  }
-  else
-  {
-  //  printf("SELF ASSIGNMENT\n");
-  }
-
-  return *this;
-}
-
-template<typename T>
-Array2<T>::~Array2()
-{
-  delete[] d;
-}
-
-template<typename T>
-inline T& Array2<T>::operator[](int i)
-{
-  assert(i>=0 && i<numel());
-
-  return d[i];
-}
-
-template<typename T>
-inline const T& Array2<T>::operator[](int i) const
-{
-  assert(i>=0 && i<numel());
-
-  return d[i];
-}
-
-template<typename T>
-inline T& Array2<T>::operator()(int i,int j)
-{
-  assert(d!=0);
-  assert(i>=0 && i<s(0) &&
-         j>=0 && j<s(1));
-
-  return d[i+j*s(0)];
-}
-
-template<typename T>
-inline const T& Array2<T>::operator()(int i,int j) const
-{
-  assert(d!=0);
-  assert(i>=0 && i<s(0) &&
-         j>=0 && j<s(1));
-
-  return d[i+j*s(0)];
-}
-
-template<typename T>
-inline T& Array2<T>::operator()(const Vec<2,int>& ij)
-{
-  assert(d!=0);
-  assert(ij(0)>=0 && ij(0)<s(0) &&
-         ij(1)>=0 && ij(1)<s(1));
-
-  return d[ij(0)+ij(1)*s(0)];
-}
-
-template<typename T>
-inline const T& Array2<T>::operator()(const Vec<2,int>& ij) const
-{
-  assert(d!=0);
-  assert(ij(0)>=0 && ij(0)<s(0) &&
-         ij(1)>=0 && ij(1)<s(1));
-
-  return d[ij(0)+ij(1)*s(0)];
-}
-
-template<typename T>
-Vec2i Array2<T>::size() const
-{
-  return s;
-}
-
-template<typename T>
-int Array2<T>::size(int dim) const
-{
-  assert(dim==0 || dim==1);
-  return size()(dim);
-}
-
-template<typename T>
-int Array2<T>::width() const
-{
-  return size(0);
-}
-
-template<typename T>
-int Array2<T>::height() const
-{
-  return size(1);
-}
-
-template<typename T>
-int Array2<T>::numel() const
-{
-  return size(0)*size(1);
-}
-
-template<typename T>
-T* Array2<T>::data()
-{
-  return d;
-}
-
-template<typename T>
-const T* Array2<T>::data() const
-{
-  return d;
-}
-
-template<typename T>
-bool Array2<T>::empty() const
-{
-  return (numel()==0);
-}
-
-template<typename T>
-void Array2<T>::clear()
-{
-  delete[] d;
-  s = Vec2i(0,0);
-  d = 0;
-}
-
-template<typename T>
-void Array2<T>::swap(Array2<T>& b)
-{
-  Vec2i tmp_s = s;
-  s = b.s;
-  b.s = tmp_s;
-
-  T* tmp_d = d;
-  d = b.d;
-  b.d = tmp_d;
-}
-
-template<typename T>
-Vec2i size(const Array2<T>& a)
-{
-  return a.size();
-}
-
-template<typename T>
-int size(const Array2<T>& a,int dim)
-{
-  return a.size(dim);
-}
-
-template<typename T>
-int numel(const Array2<T>& a)
-{
-  return a.numel();
-}
-
-template<typename T>
-void clear(Array2<T>* a)
-{
-  a->clear();
-}
-
-template<typename T>
-void swap(Array2<T>& a,Array2<T>& b)
-{
-  a.swap(b);
-}
-
-template<typename T>
-T min(const Array2<T>& a)
-{
-  assert(numel(a)>0);
-
-  const int n = numel(a);
-
-  const T* d = a.data();
-
-  T minval = d[0];
-
-  for(int i=1;i<n;i++) minval = (d[i]<minval) ? d[i] : minval;
-
-  return minval;
-}
-
-template<typename T>
-T max(const Array2<T>& a)
-{
-  assert(numel(a)>0);
-
-  const int n = numel(a);
-
-  const T* d = a.data();
-
-  T maxval = d[0];
-
-  for(int i=1;i<n;i++) maxval = (maxval<d[i]) ? d[i] : maxval;
-
-  return maxval;
-}
-
-template<typename T>
-Vec<2,T> minmax(const Array2<T>& a)
-{
-  assert(numel(a)>0);
-
-  const int n = numel(a);
-
-  const T* d = a.data();
-
-  T minval = d[0];
-  T maxval = d[0];
-
-  for(int i=1;i<n;i++)
-  {
-    minval = (d[i]<minval) ? d[i] : minval;
-    maxval = (maxval<d[i]) ? d[i] : maxval;
-  }
-
-  return Vec<2,T>(minval,maxval);
-}
-
-template<typename T>
-Vec2i argmin(const Array2<T>& a)
-{
-  assert(numel(a)>0);
-
-  T minValue = a(0,0);
-  Vec2i minIndex = Vec2i(0,0);
-
-  for(int j=0;j<a.height();j++)
-  {
-    for(int i=0;i<a.width();i++)
-    {
-      if (a(i,j)<minValue)
-      {
-        minValue = a(i,j);
-        minIndex = Vec2i(i,j);
-      }
-    }
-  }
-
-  return minIndex;
-}
-
-template<typename T>
-Vec2i argmax(const Array2<T>& a)
-{
-  assert(numel(a)>0);
-
-  T maxValue = a(0,0);
-  Vec2i maxIndex = Vec2i(0,0);
-
-  for(int j=0;j<a.height();j++)
-  {
-    for(int i=0;i<a.width();i++)
-    {
-      if (maxValue<a(i,j))
-      {
-        maxValue = a(i,j);
-        maxIndex = Vec2i(i,j);
-      }
-    }
-  }
-
-  return maxIndex;
-}
-
-template<typename T>
-T sum(const Array2<T>& a)
-{
-  assert(numel(a)>0);
-
-  const int n = numel(a);
-
-  const T* d = a.data();
-
-  T sumval = d[0];
-
-  for(int i=1;i<n;i++) sumval += d[i];
-
-  return sumval;
-}
-
-template<typename T>
-void fill(Array2<T>* a,const T& value)
-{
-  assert(a!=0);
-  assert(a->numel()>0);
-
-  const int n = a->numel();
-  T* d = a->data();
-
-  for(int i=0;i<n;i++) d[i] = value;
-}
-
-template<typename T,typename F>
-Array2<T> apply(const Array2<T>& a,F fun)
-{
-  assert(numel(a) > 0);
-
-  Array2<T> fun_a(size(a));
-
-  const int n = numel(a);
-
-  for(int i=0;i<n;i++) fun_a.data()[i] = fun(a.data()[i]);
-
-  return fun_a;
-}
-
-template<typename T>
-Array3<T>::Array3() : s(0,0,0),d(0) {}
-
-template<typename T>
-Array3<T>::Array3(int width,int height,int depth)
-{
-  assert(width>0 && height>0 && depth>0);
-  s = Vec3i(width,height,depth);
-  d = new T[s(0)*s(1)*s(2)];
-}
-
-template<typename T>
-Array3<T>::Array3(const Vec3i& size)
-{
-  // XXX: predelat na neco jako assert(all(s>0));
-  assert(size(0)>0 && size(1)>0 && size(2)>0);
-  s = size;
-  d = new T[s(0)*s(1)*s(2)];
-}
-
-template<typename T>
-Array3<T>::Array3(const Array3<T>& a)
-{
-  //  printf("COPY CONSTRUCTOR\n");
-  s = a.s;
-
-  if (s(0)>0 && s(1)>0 && s(2)>0)
-  {
-    d = new T[s(0)*s(1)*s(2)];
-
-    // XXX: optimize this:
-    for(int i=0;i<s(0)*s(1)*s(2);i++) d[i] = a.d[i];
-    //memcpy((void *)d, (void *)a.d, sizeof(T)*s(0)*s(1)*s(2));
-  }
-  else
-  {
-    d = 0;
-  }
-}
-
-template<typename T>
-Array3<T>& Array3<T>::operator=(const Array3<T>& a)
-{
-  // printf("ASSIGNMENT\n");
-  // printf("slow copy\n");
-  if (this!=&a)
-  {
-    if (s(0)==a.s(0) && s(1)==a.s(1) && s(2)==a.s(2))
-    {
-      // XXX: optimize this:
-      for(int i=0;i<s(0)*s(1)*s(2);i++) d[i] = a.d[i];
-      //memcpy((void *)d, (void *)a.d, sizeof(T)*s(0)*s(1)*s(2));
-    }
-    else
-    {
-      delete[] d;
-      s = a.s;
-
-      if (a.s(0)>0 && a.s(1)>0 && a.s(2)>0)
-      {
-        d = new T[s(0)*s(1)*s(2)];
-        // XXX: optimize this:
-        for(int i=0;i<s(0)*s(1)*s(2);i++) d[i] = a.d[i];
-        //memcpy((void *)d, (void *)a.d, sizeof(T)*s(0)*s(1)*s(2));
-      }
-      else
-      {
-        d = 0;
-      }
-    }
-  }
-  else
-  {
-  //  printf("SELF ASSIGNMENT\n");
-  }
-
-  return *this;
-}
-
-template<typename T>
-Array3<T>::~Array3()
-{
-  delete[] d;
-}
-
-template<typename T>
-inline T& Array3<T>::operator[](int i)
-{
-  assert(i>=0 && i<numel());
-
-  return d[i];
-}
-
-template<typename T>
-inline const T& Array3<T>::operator[](int i) const
-{
-  assert(i>=0 && i<numel());
-
-  return d[i];
-}
-
-template<typename T>
-inline T& Array3<T>::operator()(int i,int j,int k)
-{
-  assert(d!=0);
-  assert(i>=0 && i<s(0) &&
-         j>=0 && j<s(1) &&
-         k>=0 && k<s(2));
-
-  return d[i+(j+k*s(1))*s(0)];
-}
-
-template<typename T>
-inline const T& Array3<T>::operator()(int i,int j,int k) const
-{
-  assert(d!=0);
-  assert(i>=0 && i<s(0) &&
-         j>=0 && j<s(1) &&
-         k>=0 && k<s(2));
-
-  return d[i+(j+k*s(1))*s(0)];
-}
-
-template<typename T>
-inline T& Array3<T>::operator()(const Vec<3,int>& ijk)
-{
-  assert(d!=0);
-  assert(ijk(0)>=0 && ijk(0)<s(0) &&
-         ijk(1)>=0 && ijk(1)<s(1) &&
-         ijk(2)>=0 && ijk(2)<s(2));
-
-  return d[ijk(0)+(ijk(1)+ijk(2)*s(1))*s(0)];
-}
-
-template<typename T>
-inline const T& Array3<T>::operator()(const Vec<3,int>& ijk) const
-{
-  assert(d!=0);
-  assert(ijk(0)>=0 && ijk(0)<s(0) &&
-         ijk(1)>=0 && ijk(1)<s(1) &&
-         ijk(2)>=0 && ijk(2)<s(2));
-
-  return d[ijk(0)+(ijk(1)+ijk(2)*s(1))*s(0)];
-}
-
-template<typename T>
-Vec3i Array3<T>::size() const
-{
-  return s;
-}
-
-template<typename T>
-int Array3<T>::size(int dim) const
-{
-  assert(dim==0 || dim==1 || dim==2);
-  return size()(dim);
-}
-
-template<typename T>
-int Array3<T>::width() const
-{
-  return size(0);
-}
-
-template<typename T>
-int Array3<T>::height() const
-{
-  return size(1);
-}
-
-template<typename T>
-int Array3<T>::depth() const
-{
-  return size(2);
-}
-
-template<typename T>
-int Array3<T>::numel() const
-{
-  return size(0)*size(1)*size(2);
-}
-
-template<typename T>
-T* Array3<T>::data()
-{
-  return d;
-}
-
-template<typename T>
-const T* Array3<T>::data() const
-{
-  return d;
-}
-
-template<typename T>
-void Array3<T>::clear()
-{
-  delete[] d;
-  s = Vec3i(0,0,0);
-  d = 0;
-}
-
-template<typename T>
-void Array3<T>::swap(Array3<T>& b)
-{
-  Vec3i tmp_s = s;
-  s = b.s;
-  b.s = tmp_s;
-
-  T* tmp_d = d;
-  d = b.d;
-  b.d = tmp_d;
-}
-
-template<typename T>
-Vec3i size(const Array3<T>& a)
-{
-  return a.size();
-}
-
-template<typename T>
-int size(const Array3<T>& a,int dim)
-{
-  return a.size(dim);
-}
-
-template<typename T>
-int numel(const Array3<T>& a)
-{
-  return a.numel();
-}
-
-template<typename T>
-void clear(Array3<T>* a)
-{
-  a->clear();
-}
-
-template<typename T>
-void swap(Array3<T>& a,Array3<T>& b)
-{
-  a.swap(b);
-}
-
-#endif
+// This software is in the public domain. Where that dedication is not
+// recognized, you are granted a perpetual, irrevocable license to copy
+// and modify this file as you see fit.
+
+#ifndef JZQ_H_
+#define JZQ_H_
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdarg>
+#include <vector>
+#include <string>
+#include <algorithm>
+
+#ifdef __CUDACC__
+  #define JZQ_DECORATOR __host__ __device__
+#else
+  #define JZQ_DECORATOR
+#endif
+
+template<typename T> struct zero { static JZQ_DECORATOR T value(); };
+
+template<typename T> JZQ_DECORATOR inline T clamp(const T& x,const T& xmin,const T& xmax);
+template<typename T> JZQ_DECORATOR inline T lerp(const T& a,const T& b,float t);
+
+inline std::string spf(const std::string fmt,...);
+
+template<int N,typename T>
+struct Vec
+{
+  T v[N]; // the N components of the vector
+  // Constructors use the injected class name: `Vec<N,T>(...)` is not a valid constructor declarator in C++20.
+  JZQ_DECORATOR Vec();
+  template<typename T2> JZQ_DECORATOR explicit Vec(const Vec<N,T2>& u);
+  explicit JZQ_DECORATOR Vec(T v0);
+
+  JZQ_DECORATOR Vec(T v0,T v1);
+  JZQ_DECORATOR Vec(T v0,T v1,T v2);
+  JZQ_DECORATOR Vec(T v0,T v1,T v2,T v3);
+  JZQ_DECORATOR Vec(T v0,T v1,T v2,T v3,T v4);
+  JZQ_DECORATOR Vec(T v0,T v1,T v2,T v3,T v4,T v5);
+
+  JZQ_DECORATOR T&       operator()(int i);
+  JZQ_DECORATOR const T& operator()(int i) const;
+  JZQ_DECORATOR T&       operator[](int i);
+  JZQ_DECORATOR const T& operator[](int i) const;
+
+  JZQ_DECORATOR Vec<N,T> operator*=(const Vec<N,T>& u);
+  JZQ_DECORATOR Vec<N,T> operator+=(const Vec<N,T>& u);
+
+  JZQ_DECORATOR Vec<N,T> operator*=(T s);
+  JZQ_DECORATOR Vec<N,T> operator+=(T s);
+};
+
+template<int N,typename T> Vec<N,T> JZQ_DECORATOR operator-(const Vec<N,T>& u);
+template<int N,typename T> Vec<N,T> JZQ_DECORATOR operator+(const Vec<N,T>& u,const Vec<N,T>& v);
+template<int N,typename T> Vec<N,T> JZQ_DECORATOR operator-(const Vec<N,T>& u,const Vec<N,T>& v);
+template<int N,typename T> Vec<N,T> JZQ_DECORATOR operator-(const Vec<N,T>& u,const T v);
+template<int N,typename T> Vec<N,T> JZQ_DECORATOR operator*(const Vec<N,T>& u,const Vec<N,T>& v);
+template<int N,typename T> Vec<N,T> JZQ_DECORATOR operator/(const Vec<N,T>& u,const Vec<N,T>& v);
+template<int N,typename T> Vec<N,T> JZQ_DECORATOR operator*(const T s,const Vec<N,T>& u);
+template<int N,typename T> Vec<N,T> JZQ_DECORATOR operator*(const Vec<N,T>& u,const T s);
+template<int N,typename T> Vec<N,T> JZQ_DECORATOR operator/(const Vec<N,T>& u,const T s);
+
+template<int N,typename T> Vec<N,bool> JZQ_DECORATOR operator<(const Vec<N,T>& u,const Vec<N,T>& v);
+template<int N,typename T> Vec<N,bool> JZQ_DECORATOR operator>(const Vec<N,T>& u,const Vec<N,T>& v);
+template<int N,typename T> Vec<N,bool> JZQ_DECORATOR operator<=(const Vec<N,T>& u,const Vec<N,T>& v);
+template<int N,typename T> Vec<N,bool> JZQ_DECORATOR operator>=(const Vec<N,T>& u,const Vec<N,T>& v);
+template<int N,typename T> Vec<N,bool> JZQ_DECORATOR operator==(const Vec<N,T>& u,const Vec<N,T>& v);
+template<int N,typename T> Vec<N,bool> JZQ_DECORATOR operator!=(const Vec<N,T>& u,const Vec<N,T>& v);
+
+template<int N,typename T> JZQ_DECORATOR inline T        dot(const Vec<N,T>& u,const Vec<N,T>& v);
+template<typename T>       JZQ_DECORATOR inline T        cross(const Vec<2,T> &a,const Vec<2,T> &b);
+template<typename T>       JZQ_DECORATOR inline Vec<3,T> cross(const Vec<3,T> &a,const Vec<3,T> &b);
+template<int N,typename T> JZQ_DECORATOR inline T        norm(const Vec<N,T>& u);
+template<int N,typename T> JZQ_DECORATOR inline Vec<N,T> normalize(const Vec<N,T>& u);
+template<int N,typename T> JZQ_DECORATOR inline T        min(const Vec<N,T>& u);
+template<int N,typename T> JZQ_DECORATOR inline T        max(const Vec<N,T>& u);
+template<int N,typename T> JZQ_DECORATOR inline T        sum(const Vec<N,T>& u);
+namespace std
+{
+template<int N,typename T> inline Vec<N,T> min(const Vec<N,T>& u,const Vec<N,T>& v);
+template<int N,typename T> inline Vec<N,T> max(const Vec<N,T>& u,const Vec<N,T>& v);
+}
+template<int N,typename T> inline Vec<N,T> abs(const Vec<N,T>& x);
+
+template<int N>            inline bool     any(const Vec<N,bool>& u);
+template<int N>            inline bool     all(const Vec<N,bool>& u);
+
+template<int M,int N,typename T>
+struct Mat
+{
+  T m[M][N]; // M x N element storage
+  // Constructors use the injected class name: `Mat<M,N,T>(...)` is not a valid constructor declarator in C++20.
+  Mat();
+
+  Mat(T a00,T a01,
+      T a10,T a11);
+
+  Mat(T a00,T a01,T a02,
+      T a10,T a11,T a12,
+      T a20,T a21,T a22);
+
+  Mat(T a00,T a01,T a02,T a03,
+      T a10,T a11,T a12,T a13,
+      T a20,T a21,T a22,T a23,
+      T a30,T a31,T a32,T a33);
+
+  T&       operator()(int i,int j);
+  const T& operator()(int i,int j) const;
+
+  T*       data();
+  const T* data() const;
+};
+
+template<int M1,int N1,int M2,int N2,typename T> Mat<M1,N2,T> operator*(const Mat<M1,N1,T>& A,const Mat<M2,N2,T>& B);
+
+template<int M,int N,typename T> Vec<M,T> operator*(const Mat<M,N,T>& A,const Vec<N,T>& u);
+template<int M,int N,typename T> Vec<N,T> operator*(const Vec<M,T>& u,const Mat<M,N,T>& A);
+
+template<int M,int N,typename T> Mat<N,M,T> transpose(const Mat<M,N,T>& A);
+template<int N,typename T>       T          trace(const Mat<N,N,T>& A);
+template<int N,typename T>       Mat<N,N,T> inverse(const Mat<N,N,T>& A);
+
+template<typename T>
+class Array2
+{
+public:
+  Array2();
+  Array2(int width,int height);
+  explicit Array2(const Vec<2,int>& size);
+  Array2(const Array2<T>& a);
+  ~Array2();
+
+  Array2&  operator=(const Array2<T>& a);
+
+  inline T&       operator[](int i);
+  inline const T& operator[](int i) const;
+  inline T&       operator()(int i,int j);
+  inline const T& operator()(int i,int j) const;
+  inline T&       operator()(const Vec<2,int>& ij);
+  inline const T& operator()(const Vec<2,int>& ij) const;
+
+  Vec<2,int> size() const;
+  int        size(int dim) const;
+  int        width() const;
+  int        height() const;
+  int        numel() const;
+  T*         data();
+  const T*   data() const;
+  void       clear();
+  void       swap(Array2<T>& b);
+  bool       empty() const;
+
+private:
+  Vec<2,int> s;
+  T* d;
+};
+
+template<typename T> Vec<2,int> size(const Array2<T>& a);
+template<typename T> int        size(const Array2<T>& a,int dim);
+template<typename T> int        numel(const Array2<T>& a);
+template<typename T> void       clear(Array2<T>* a);
+template<typename T> void       swap(Array2<T>& a,Array2<T>& b);
+template<typename T> T          min(const Array2<T>& a);
+template<typename T> T          max(const Array2<T>& a);
+template<typename T> Vec<2,T>   minmax(const Array2<T>& a);
+template<typename T> Vec<2,int> argmin(const Array2<T>& a);
+template<typename T> Vec<2,int> argmax(const Array2<T>& a);
+template<typename T> T          sum(const Array2<T>& a);
+template<typename T> void       fill(Array2<T>* a,const T& value);
+
+template<typename T,typename F> Array2<T> apply(const Array2<T>& a,F fun);
+
+template<typename T>
+class Array3
+{
+public:
+  Array3();
+  explicit Array3(const Vec<3,int>& size);
+  Array3(int width,int height,int depth);
+  Array3(const Array3<T>& a);
+  ~Array3();
+
+  Array3& operator=(const Array3<T>& a);
+
+  inline T&       operator[](int i);
+  inline const T& operator[](int i) const;
+  inline T&       operator()(int i,int j,int k);
+  inline const T& operator()(int i,int j,int k) const;
+  inline T&       operator()(const Vec<3,int>& ijk);
+  inline const T& operator()(const Vec<3,int>& ijk) const;
+
+  Vec<3,int> size() const;
+  int        size(int dim) const;
+  int        width() const;
+  int        height() const;
+  int        depth() const;
+  int        numel() const;
+  T*         data();
+  const T*   data() const;
+  void       clear();
+  void       swap(Array3<T>& b);
+  bool       empty() const;
+
+private:
+  Vec<3,int> s;
+  T* d;
+};
+
+template<typename T> Vec<3,int> size(const Array3<T>& a);
+template<typename T> int        size(const Array3<T>& a,int dim);
+template<typename T> int        numel(const Array3<T>& a);
+template<typename T> void       clear(Array3<T>* a);
+template<typename T> void       swap(Array3<T>& a,Array3<T>& b);
+
+typedef Vec<2,double>         Vec2d;
+typedef Vec<2,float>          Vec2f;
+typedef Vec<2,int>            Vec2i;
+typedef Vec<2,unsigned int>   Vec2ui;
+typedef Vec<2,short>          Vec2s;
+typedef Vec<2,unsigned short> Vec2us;
+typedef Vec<2,char>           Vec2c;
+typedef Vec<2,unsigned char>  Vec2uc;
+
+typedef Vec<3,double>         Vec3d;
+typedef Vec<3,float>          Vec3f;
+typedef Vec<3,int>            Vec3i;
+typedef Vec<3,unsigned int>   Vec3ui;
+typedef Vec<3,short>          Vec3s;
+typedef Vec<3,unsigned short> Vec3us;
+typedef Vec<3,char>           Vec3c;
+typedef Vec<3,unsigned char>  Vec3uc;
+
+typedef Vec<4,double>         Vec4d;
+typedef Vec<4,float>          Vec4f;
+typedef Vec<4,int>            Vec4i;
+typedef Vec<4,unsigned int>   Vec4ui;
+typedef Vec<4,short>          Vec4s;
+typedef Vec<4,unsigned short> Vec4us;
+typedef Vec<4,char>           Vec4c;
+typedef Vec<4,unsigned char>  Vec4uc;
+
+typedef Vec<5,double>         Vec5d;
+typedef Vec<5,float>          Vec5f;
+typedef Vec<5,int>            Vec5i;
+typedef Vec<5,unsigned int>   Vec5ui;
+typedef Vec<5,short>          Vec5s;
+typedef Vec<5,unsigned short> Vec5us;
+typedef Vec<5,char>           Vec5c;
+typedef Vec<5,unsigned char>  Vec5uc;
+
+typedef Vec<6,double>         Vec6d;
+typedef Vec<6,float>          Vec6f;
+typedef Vec<6,int>            Vec6i;
+typedef Vec<6,unsigned int>   Vec6ui;
+typedef Vec<6,short>          Vec6s;
+typedef Vec<6,unsigned short> Vec6us;
+typedef Vec<6,char>           Vec6c;
+typedef Vec<6,unsigned char>  Vec6uc;
+
+typedef Vec<2,double>         V2d;
+typedef Vec<2,float>          V2f;
+typedef Vec<2,int>            V2i;
+typedef Vec<2,unsigned int>   V2ui;
+typedef Vec<2,short>          V2s;
+typedef Vec<2,unsigned short> V2us;
+typedef Vec<2,char>           V2c;
+typedef Vec<2,unsigned char>  V2uc;
+
+typedef Vec<3,double>         V3d;
+typedef Vec<3,float>          V3f;
+typedef Vec<3,int>            V3i;
+typedef Vec<3,unsigned int>   V3ui;
+typedef Vec<3,short>          V3s;
+typedef Vec<3,unsigned short> V3us;
+typedef Vec<3,char>           V3c;
+typedef Vec<3,unsigned char>  V3uc;
+
+typedef Vec<4,double>         V4d;
+typedef Vec<4,float>          V4f;
+typedef Vec<4,int>            V4i;
+typedef Vec<4,unsigned int>   V4ui;
+typedef Vec<4,short>          V4s;
+typedef Vec<4,unsigned short> V4us;
+typedef Vec<4,char>           V4c;
+typedef Vec<4,unsigned char>  V4uc;
+
+typedef Vec<5,double>         V5d;
+typedef Vec<5,float>          V5f;
+typedef Vec<5,int>            V5i;
+typedef Vec<5,unsigned int>   V5ui;
+typedef Vec<5,short>          V5s;
+typedef Vec<5,unsigned short> V5us;
+typedef Vec<5,char>           V5c;
+typedef Vec<5,unsigned char>  V5uc;
+
+typedef Vec<6,double>         V6d;
+typedef Vec<6,float>          V6f;
+typedef Vec<6,int>            V6i;
+typedef Vec<6,unsigned int>   V6ui;
+typedef Vec<6,short>          V6s;
+typedef Vec<6,unsigned short> V6us;
+typedef Vec<6,char>           V6c;
+typedef Vec<6,unsigned char>  V6uc;
+
+typedef Mat<2,2,float> Mat2x2f;
+typedef Mat<2,3,float> Mat2x3f;
+typedef Mat<2,4,float> Mat2x4f;
+typedef Mat<2,5,float> Mat2x5f;
+typedef Mat<2,6,float> Mat2x6f;
+typedef Mat<2,7,float> Mat2x7f;
+typedef Mat<2,8,float> Mat2x8f;
+typedef Mat<3,2,float> Mat3x2f;
+typedef Mat<3,3,float> Mat3x3f;
+typedef Mat<3,4,float> Mat3x4f;
+typedef Mat<3,5,float> Mat3x5f;
+typedef Mat<3,6,float> Mat3x6f;
+typedef Mat<3,7,float> Mat3x7f;
+typedef Mat<3,8,float> Mat3x8f;
+typedef Mat<4,2,float> Mat4x2f;
+typedef Mat<4,3,float> Mat4x3f;
+typedef Mat<4,4,float> Mat4x4f;
+typedef Mat<4,5,float> Mat4x5f;
+typedef Mat<4,6,float> Mat4x6f;
+typedef Mat<4,7,float> Mat4x7f;
+typedef Mat<4,8,float> Mat4x8f;
+typedef Mat<5,2,float> Mat5x2f;
+typedef Mat<5,3,float> Mat5x3f;
+typedef Mat<5,4,float> Mat5x4f;
+typedef Mat<5,5,float> Mat5x5f;
+typedef Mat<5,6,float> Mat5x6f;
+typedef Mat<5,7,float> Mat5x7f;
+typedef Mat<5,8,float> Mat5x8f;
+typedef Mat<6,2,float> Mat6x2f;
+typedef Mat<6,3,float> Mat6x3f;
+typedef Mat<6,4,float> Mat6x4f;
+typedef Mat<6,5,float> Mat6x5f;
+typedef Mat<6,6,float> Mat6x6f;
+typedef Mat<6,7,float> Mat6x7f;
+typedef Mat<6,8,float> Mat6x8f;
+typedef Mat<7,2,float> Mat7x2f;
+typedef Mat<7,3,float> Mat7x3f;
+typedef Mat<7,4,float> Mat7x4f;
+typedef Mat<7,5,float> Mat7x5f;
+typedef Mat<7,6,float> Mat7x6f;
+typedef Mat<7,7,float> Mat7x7f;
+typedef Mat<7,8,float> Mat7x8f;
+typedef Mat<8,2,float> Mat8x2f;
+typedef Mat<8,3,float> Mat8x3f;
+typedef Mat<8,4,float> Mat8x4f;
+typedef Mat<8,5,float> Mat8x5f;
+typedef Mat<8,6,float> Mat8x6f;
+typedef Mat<8,7,float> Mat8x7f;
+typedef Mat<8,8,float> Mat8x8f;
+
+typedef Mat<2,2,double> Mat2x2d;
+typedef Mat<2,3,double> Mat2x3d;
+typedef Mat<2,4,double> Mat2x4d;
+typedef Mat<2,5,double> Mat2x5d;
+typedef Mat<2,6,double> Mat2x6d;
+typedef Mat<2,7,double> Mat2x7d;
+typedef Mat<2,8,double> Mat2x8d;
+typedef Mat<3,2,double> Mat3x2d;
+typedef Mat<3,3,double> Mat3x3d;
+typedef Mat<3,4,double> Mat3x4d;
+typedef Mat<3,5,double> Mat3x5d;
+typedef Mat<3,6,double> Mat3x6d;
+typedef Mat<3,7,double> Mat3x7d;
+typedef Mat<3,8,double> Mat3x8d;
+typedef Mat<4,2,double> Mat4x2d;
+typedef Mat<4,3,double> Mat4x3d;
+typedef Mat<4,4,double> Mat4x4d;
+typedef Mat<4,5,double> Mat4x5d;
+typedef Mat<4,6,double> Mat4x6d;
+typedef Mat<4,7,double> Mat4x7d;
+typedef Mat<4,8,double> Mat4x8d;
+typedef Mat<5,2,double> Mat5x2d;
+typedef Mat<5,3,double> Mat5x3d;
+typedef Mat<5,4,double> Mat5x4d;
+typedef Mat<5,5,double> Mat5x5d;
+typedef Mat<5,6,double> Mat5x6d;
+typedef Mat<5,7,double> Mat5x7d;
+typedef Mat<5,8,double> Mat5x8d;
+typedef Mat<6,2,double> Mat6x2d;
+typedef Mat<6,3,double> Mat6x3d;
+typedef Mat<6,4,double> Mat6x4d;
+typedef Mat<6,5,double> Mat6x5d;
+typedef Mat<6,6,double> Mat6x6d;
+typedef Mat<6,7,double> Mat6x7d;
+typedef Mat<6,8,double> Mat6x8d;
+typedef Mat<7,2,double> Mat7x2d;
+typedef Mat<7,3,double> Mat7x3d;
+typedef Mat<7,4,double> Mat7x4d;
+typedef Mat<7,5,double> Mat7x5d;
+typedef Mat<7,6,double> Mat7x6d;
+typedef Mat<7,7,double> Mat7x7d;
+typedef Mat<7,8,double> Mat7x8d;
+typedef Mat<8,2,double> Mat8x2d;
+typedef Mat<8,3,double> Mat8x3d;
+typedef Mat<8,4,double> Mat8x4d;
+typedef Mat<8,5,double> Mat8x5d;
+typedef Mat<8,6,double> Mat8x6d;
+typedef Mat<8,7,double> Mat8x7d;
+typedef Mat<8,8,double> Mat8x8d;
+
+typedef Array2<double>                  Array2d;
+typedef Array2<float>                   Array2f;
+typedef Array2<int>                     Array2i;
+typedef Array2<unsigned int>            Array2ui;
+typedef Array2<short>                   Array2s;
+typedef Array2<unsigned short>          Array2us;
+typedef Array2<char>                    Array2c;
+typedef Array2<unsigned char>           Array2uc;
+
+typedef Array2< Vec<2,double> >         Array2V2d;
+typedef Array2< Vec<2,float> >          Array2V2f;
+typedef Array2< Vec<2,int> >            Array2V2i;
+typedef Array2< Vec<2,unsigned int> >   Array2V2ui;
+typedef Array2< Vec<2,short> >          Array2V2s;
+typedef Array2< Vec<2,unsigned short> > Array2V2us;
+typedef Array2< Vec<2,char> >           Array2V2c;
+typedef Array2< Vec<2,unsigned char> >  Array2V2uc;
+
+typedef Array2< Vec<3,double> >         Array2V3d;
+typedef Array2< Vec<3,float> >          Array2V3f;
+typedef Array2< Vec<3,int> >            Array2V3i;
+typedef Array2< Vec<3,unsigned int> >   Array2V3ui;
+typedef Array2< Vec<3,short> >          Array2V3s;
+typedef Array2< Vec<3,unsigned short> > Array2V3us;
+typedef Array2< Vec<3,char> >           Array2V3c;
+typedef Array2< Vec<3,unsigned char> >  Array2V3uc;
+
+typedef Array2< Vec<4,double> >         Array2V4d;
+typedef Array2< Vec<4,float> >          Array2V4f;
+typedef Array2< Vec<4,int> >            Array2V4i;
+typedef Array2< Vec<4,unsigned int> >   Array2V4ui;
+typedef Array2< Vec<4,short> >          Array2V4s;
+typedef Array2< Vec<4,unsigned short> > Array2V4us;
+typedef Array2< Vec<4,char> >           Array2V4c;
+typedef Array2< Vec<4,unsigned char> >  Array2V4uc;
+
+typedef Array2<double>                  A2d;
+typedef Array2<float>                   A2f;
+typedef Array2<int>                     A2i;
+typedef Array2<unsigned int>            A2ui;
+typedef Array2<short>                   A2s;
+typedef Array2<unsigned short>          A2us;
+typedef Array2<char>                    A2c;
+typedef Array2<unsigned char>           A2uc;
+
+typedef Array2< Vec<2,double> >         A2V2d;
+typedef Array2< Vec<2,float> >          A2V2f;
+typedef Array2< Vec<2,int> >            A2V2i;
+typedef Array2< Vec<2,unsigned int> >   A2V2ui;
+typedef Array2< Vec<2,short> >          A2V2s;
+typedef Array2< Vec<2,unsigned short> > A2V2us;
+typedef Array2< Vec<2,char> >           A2V2c;
+typedef Array2< Vec<2,unsigned char> >  A2V2uc;
+
+typedef Array2< Vec<3,double> >         A2V3d;
+typedef Array2< Vec<3,float> >          A2V3f;
+typedef Array2< Vec<3,int> >            A2V3i;
+typedef Array2< Vec<3,unsigned int> >   A2V3ui;
+typedef Array2< Vec<3,short> >          A2V3s;
+typedef Array2< Vec<3,unsigned short> > A2V3us;
+typedef Array2< Vec<3,char> >           A2V3c;
+typedef Array2< Vec<3,unsigned char> >  A2V3uc;
+
+typedef Array2< Vec<4,double> >         A2V4d;
+typedef Array2< Vec<4,float> >          A2V4f;
+typedef Array2< Vec<4,int> >            A2V4i;
+typedef Array2< Vec<4,unsigned int> >   A2V4ui;
+typedef Array2< Vec<4,short> >          A2V4s;
+typedef Array2< Vec<4,unsigned short> > A2V4us;
+typedef Array2< Vec<4,char> >           A2V4c;
+typedef Array2< Vec<4,unsigned char> >  A2V4uc;
+
+typedef Array3<double>                  Array3d;   // Array3 of scalars; the suffix names the element type
+typedef Array3<float>                   Array3f;
+typedef Array3<int>                     Array3i;
+typedef Array3<unsigned int>            Array3ui;
+typedef Array3<short>                   Array3s;
+typedef Array3<unsigned short>          Array3us;
+typedef Array3<char>                    Array3c;
+typedef Array3<unsigned char>           Array3uc;
+// Array3V2*: Array3 whose elements are Vec<2,...>.
+typedef Array3< Vec<2,double> >         Array3V2d;
+typedef Array3< Vec<2,float> >          Array3V2f;
+typedef Array3< Vec<2,int> >            Array3V2i;
+typedef Array3< Vec<2,unsigned int> >   Array3V2ui;
+typedef Array3< Vec<2,short> >          Array3V2s;
+typedef Array3< Vec<2,unsigned short> > Array3V2us;
+typedef Array3< Vec<2,char> >           Array3V2c;
+typedef Array3< Vec<2,unsigned char> >  Array3V2uc;
+// Array3V3*: Array3 whose elements are Vec<3,...>.
+typedef Array3< Vec<3,double> >         Array3V3d;
+typedef Array3< Vec<3,float> >          Array3V3f;
+typedef Array3< Vec<3,int> >            Array3V3i;
+typedef Array3< Vec<3,unsigned int> >   Array3V3ui;
+typedef Array3< Vec<3,short> >          Array3V3s;
+typedef Array3< Vec<3,unsigned short> > Array3V3us;
+typedef Array3< Vec<3,char> >           Array3V3c;
+typedef Array3< Vec<3,unsigned char> >  Array3V3uc;
+// Array3V4*: Array3 whose elements are Vec<4,...>.
+typedef Array3< Vec<4,double> >         Array3V4d;
+typedef Array3< Vec<4,float> >          Array3V4f;
+typedef Array3< Vec<4,int> >            Array3V4i;
+typedef Array3< Vec<4,unsigned int> >   Array3V4ui;
+typedef Array3< Vec<4,short> >          Array3V4s;
+typedef Array3< Vec<4,unsigned short> > Array3V4us;
+typedef Array3< Vec<4,char> >           Array3V4c;
+typedef Array3< Vec<4,unsigned char> >  Array3V4uc;
+
+typedef Array3<double>                  A3d;   // short A3* spellings of the Array3 aliases; same element types
+typedef Array3<float>                   A3f;
+typedef Array3<int>                     A3i;
+typedef Array3<unsigned int>            A3ui;
+typedef Array3<short>                   A3s;
+typedef Array3<unsigned short>          A3us;
+typedef Array3<char>                    A3c;
+typedef Array3<unsigned char>           A3uc;
+// A3V2*: Array3 whose elements are Vec<2,...>.
+typedef Array3< Vec<2,double> >         A3V2d;
+typedef Array3< Vec<2,float> >          A3V2f;
+typedef Array3< Vec<2,int> >            A3V2i;
+typedef Array3< Vec<2,unsigned int> >   A3V2ui;
+typedef Array3< Vec<2,short> >          A3V2s;
+typedef Array3< Vec<2,unsigned short> > A3V2us;
+typedef Array3< Vec<2,char> >           A3V2c;
+typedef Array3< Vec<2,unsigned char> >  A3V2uc;
+// A3V3*: Array3 whose elements are Vec<3,...>.
+typedef Array3< Vec<3,double> >         A3V3d;
+typedef Array3< Vec<3,float> >          A3V3f;
+typedef Array3< Vec<3,int> >            A3V3i;
+typedef Array3< Vec<3,unsigned int> >   A3V3ui;
+typedef Array3< Vec<3,short> >          A3V3s;
+typedef Array3< Vec<3,unsigned short> > A3V3us;
+typedef Array3< Vec<3,char> >           A3V3c;
+typedef Array3< Vec<3,unsigned char> >  A3V3uc;
+// A3V4*: Array3 whose elements are Vec<4,...>.
+typedef Array3< Vec<4,double> >         A3V4d;
+typedef Array3< Vec<4,float> >          A3V4f;
+typedef Array3< Vec<4,int> >            A3V4i;
+typedef Array3< Vec<4,unsigned int> >   A3V4ui;
+typedef Array3< Vec<4,short> >          A3V4s;
+typedef Array3< Vec<4,unsigned short> > A3V4us;
+typedef Array3< Vec<4,char> >           A3V4c;
+typedef Array3< Vec<4,unsigned char> >  A3V4uc;
+
+template<> struct zero<char>           { static JZQ_DECORATOR char           value() { return static_cast<char>(0);           } }; // zero<T>::value(): the value 0 of each built-in scalar type
+template<> struct zero<unsigned char>  { static JZQ_DECORATOR unsigned char  value() { return static_cast<unsigned char>(0);  } };
+template<> struct zero<short>          { static JZQ_DECORATOR short          value() { return static_cast<short>(0);          } };
+template<> struct zero<unsigned short> { static JZQ_DECORATOR unsigned short value() { return static_cast<unsigned short>(0); } };
+template<> struct zero<int>            { static JZQ_DECORATOR int            value() { return 0;    } };
+template<> struct zero<unsigned int>   { static JZQ_DECORATOR unsigned int   value() { return 0u;   } };
+template<> struct zero<float>          { static JZQ_DECORATOR float          value() { return 0.0f; } };
+template<> struct zero<double>         { static JZQ_DECORATOR double         value() { return 0.0;  } };
+
+// zero for vectors: every component is set to zero<T>::value().
+template<int N,typename T> struct zero<Vec<N,T>>
+{
+  static JZQ_DECORATOR Vec<N,T> value()
+  {
+    Vec<N,T> result;
+    for(int c=0;c<N;++c) { result[c] = zero<T>::value(); }
+    return result;
+  }
+};
+
+// zero for matrices: every entry is set to zero<T>::value().
+template<int M,int N,typename T> struct zero<Mat<M,N,T>>
+{
+  static JZQ_DECORATOR Mat<M,N,T> value()
+  {
+    Mat<M,N,T> result;
+    for(int row=0;row<M;++row)
+    for(int col=0;col<N;++col)
+    {
+      result(row,col) = zero<T>::value();
+    }
+    return result;
+  }
+};
+
+// Limits x to [xmin,xmax]: std::max applies the lower bound first, std::min then the upper one.
+template <typename T> JZQ_DECORATOR inline T clamp(const T& x,const T& xmin,const T& xmax)
+{
+  return std::min(std::max(x,xmin),xmax);
+}
+
+// Linear interpolation: a weighted by (1-t) plus b weighted by t.
+template <typename T> JZQ_DECORATOR inline T lerp(const T& a,const T& b,float t)
+{
+  return (1.0f-t)*a+t*b;
+}
+
+// printf-style formatting; returns the formatted text as a std::string.
+// fmt is taken by value because va_start must not be applied to a reference parameter.
+// The buffer starts at 1024 bytes and grows until the output fits; once more than
+// 16 MiB would be needed an empty string is returned instead.
+inline std::string spf(const std::string fmt,...)
+{
+  int size = 1024;
+  std::vector<char> buf;
+  va_list ap;
+
+  while(1)
+  {
+    if(size>16*1024*1024) { return std::string(""); }
+
+    buf.resize(size);
+
+    // Offer vsnprintf the whole buffer. Offering size-1 while accepting n<size
+    // cut the last character off every result of 1023 or more characters.
+    va_start(ap,fmt);
+    const int n = vsnprintf(&buf[0],size,fmt.c_str(),ap);
+    va_end(ap);
+
+    // n<size means the text and its terminating NUL both fit.
+    if(n>-1 && n<size) { break; }
+
+    // A non-negative n is the length that is needed (plus one for the NUL);
+    // a negative n carries no length, so the buffer is simply doubled.
+    size = (n>-1) ? n+1 : 2*size;
+  }
+
+  return std::string(&buf[0]);
+}
+
+// Default constructor: empty body, so no component is assigned here.
+template<int N,typename T> JZQ_DECORATOR
+Vec<N,T>::Vec()
+{
+}
+
+// Builds a 1-component vector; asserts N==1.
+template<int N,typename T> JZQ_DECORATOR
+Vec<N,T>::Vec(T v0)
+{
+  assert(N==1);
+  v[0] = v0;
+}
+
+// Builds a 2-component vector; asserts N==2.
+template<int N,typename T> JZQ_DECORATOR
+Vec<N,T>::Vec(T v0,T v1)
+{
+  assert(N==2);
+  v[0] = v0;  v[1] = v1;
+}
+
+// Builds a 3-component vector; asserts N==3.
+template<int N,typename T> JZQ_DECORATOR
+Vec<N,T>::Vec(T v0,T v1,T v2)
+{
+  assert(N==3);
+  v[0] = v0;  v[1] = v1;  v[2] = v2;
+}
+
+// Builds a 4-component vector; asserts N==4.
+template<int N,typename T> JZQ_DECORATOR
+Vec<N,T>::Vec(T v0,T v1,T v2,T v3)
+{
+  assert(N==4);
+  v[0] = v0;  v[1] = v1;  v[2] = v2;  v[3] = v3;
+}
+
+// Builds a 5-component vector; asserts N==5.
+template<int N,typename T> JZQ_DECORATOR
+Vec<N,T>::Vec(T v0,T v1,T v2,T v3,T v4)
+{
+  assert(N==5);
+  v[0] = v0;  v[1] = v1;  v[2] = v2;  v[3] = v3;  v[4] = v4;
+}
+
+// Builds a 6-component vector; asserts N==6.
+template<int N,typename T> JZQ_DECORATOR
+Vec<N,T>::Vec(T v0,T v1,T v2,T v3,T v4,T v5)
+{
+  assert(N==6);
+  v[0] = v0;  v[1] = v1;  v[2] = v2;  v[3] = v3;  v[4] = v4;  v[5] = v5;
+}
+
+// Converting constructor: copies u component by component, static_cast-ing each T2 to T.
+template<int N,typename T> template<typename T2> JZQ_DECORATOR
+Vec<N,T>::Vec(const Vec<N,T2>& u)
+{
+  for(int c=0;c<N;++c)
+  {
+    v[c] = static_cast<T>(u.v[c]);
+  }
+}
+
+// Mutable access to component i; the index range 0..N-1 is asserted.
+template<int N,typename T> JZQ_DECORATOR
+T& Vec<N,T>::operator()(int i)
+{
+  assert(i>=0 && i<N);
+  return v[i];
+}
+
+// Read-only access to component i; the index range 0..N-1 is asserted.
+template<int N,typename T> JZQ_DECORATOR
+const T& Vec<N,T>::operator()(int i) const
+{
+  assert(i>=0 && i<N);
+  return v[i];
+}
+
+// Subscript form of component access (mutable); the index range 0..N-1 is asserted.
+template<int N,typename T> JZQ_DECORATOR
+T& Vec<N,T>::operator[](int i)
+{
+  assert(i>=0 && i<N);
+  return v[i];
+}
+
+// Subscript form of component access (read-only); the index range 0..N-1 is asserted.
+template<int N,typename T> JZQ_DECORATOR
+const T& Vec<N,T>::operator[](int i) const
+{
+  assert(i>=0 && i<N);
+  return v[i];
+}
+
+// Multiplies each component by the matching component of u; returns a copy of the updated vector.
+template<int N,typename T> JZQ_DECORATOR
+Vec<N,T> Vec<N,T>::operator*=(const Vec<N,T>& u)
+{
+  for(int c=0;c<N;++c) { v[c] *= u(c); }
+  return *this;
+}
+
+// Adds the matching component of u to each component; returns a copy of the updated vector.
+template<int N,typename T> JZQ_DECORATOR
+Vec<N,T> Vec<N,T>::operator+=(const Vec<N,T>& u)
+{
+  for(int c=0;c<N;++c) { v[c] += u(c); }
+  return *this;
+}
+
+// Multiplies every component by the scalar s; returns a copy of the updated vector.
+template<int N,typename T> JZQ_DECORATOR
+Vec<N,T> Vec<N,T>::operator*=(T s)
+{
+  for(int c=0;c<N;++c) { v[c] *= s; }
+  return *this;
+}
+
+// Adds the scalar s to every component; returns a copy of the updated vector.
+template<int N,typename T> JZQ_DECORATOR
+Vec<N,T> Vec<N,T>::operator+=(T s)
+{
+  for(int c=0;c<N;++c) { v[c] += s; }
+  return *this;
+}
+
+// Unary minus: negates every component.
+template<int N,typename T> JZQ_DECORATOR
+Vec<N,T> operator-(const Vec<N,T>& u)
+{
+  Vec<N,T> out;
+  for(int c=0;c<N;++c) { out(c) = -u(c); }
+  return out;
+}
+
+// Component-wise sum u+v.
+template<int N,typename T> JZQ_DECORATOR
+Vec<N,T> operator+(const Vec<N,T>& u,const Vec<N,T>& v)
+{
+  Vec<N,T> out;
+  for(int c=0;c<N;++c) { out(c) = u(c)+v(c); }
+  return out;
+}
+
+// Component-wise difference u-v.
+template<int N,typename T> JZQ_DECORATOR
+Vec<N,T> operator-(const Vec<N,T>& u,const Vec<N,T>& v)
+{
+  Vec<N,T> out;
+  for(int c=0;c<N;++c) { out(c) = u(c)-v(c); }
+  return out;
+}
+
+// Subtracts the scalar v from every component of u.
+template<int N,typename T> JZQ_DECORATOR
+Vec<N,T> operator-(const Vec<N,T>& u,const T v)
+{
+  Vec<N,T> out;
+  for(int c=0;c<N;++c) { out(c) = u(c)-v; }
+  return out;
+}
+
+// Component-wise product u*v (not a dot product).
+template<int N,typename T> JZQ_DECORATOR
+Vec<N,T> operator*(const Vec<N,T>& u,const Vec<N,T>& v)
+{
+  Vec<N,T> out;
+  for(int c=0;c<N;++c) { out(c) = u(c)*v(c); }
+  return out;
+}
+
+// Component-wise quotient u/v.
+template<int N,typename T> JZQ_DECORATOR
+Vec<N,T> operator/(const Vec<N,T>& u,const Vec<N,T>& v)
+{
+  Vec<N,T> out;
+  for(int c=0;c<N;++c) { out(c) = u(c)/v(c); }
+  return out;
+}
+
+// Scalar times vector: s multiplies every component from the left.
+template<int N,typename T> JZQ_DECORATOR
+Vec<N,T> operator*(const T s,const Vec<N,T>& u)
+{
+  Vec<N,T> out;
+  for(int c=0;c<N;++c) { out(c) = s*u(c); }
+  return out;
+}
+
+// Vector times scalar: s multiplies every component from the right.
+template<int N,typename T> JZQ_DECORATOR
+Vec<N,T> operator*(const Vec<N,T>& u,const T s)
+{
+  Vec<N,T> out;
+  for(int c=0;c<N;++c) { out(c) = u(c)*s; }
+  return out;
+}
+
+// Divides every component of u by the scalar s.
+template<int N,typename T> JZQ_DECORATOR
+Vec<N,T> operator/(const Vec<N,T>& u,const T s)
+{
+  Vec<N,T> out;
+  for(int c=0;c<N;++c) { out(c) = u(c)/s; }
+  return out;
+}
+
+// Component-wise u<v; the result holds one bool per component.
+template<int N,typename T> JZQ_DECORATOR
+Vec<N,bool> operator<(const Vec<N,T>& u,const Vec<N,T>& v)
+{
+  Vec<N,bool> flags;
+  for(int c=0;c<N;++c) { flags(c) = u(c)<v(c); }
+  return flags;
+}
+
+// Component-wise u>v; the result holds one bool per component.
+template<int N,typename T> JZQ_DECORATOR
+Vec<N,bool> operator>(const Vec<N,T>& u,const Vec<N,T>& v)
+{
+  Vec<N,bool> flags;
+  for(int c=0;c<N;++c) { flags(c) = u(c)>v(c); }
+  return flags;
+}
+
+// Component-wise u<=v; the result holds one bool per component.
+template<int N,typename T> JZQ_DECORATOR
+Vec<N,bool> operator<=(const Vec<N,T>& u,const Vec<N,T>& v)
+{
+  Vec<N,bool> flags;
+  for(int c=0;c<N;++c) { flags(c) = u(c)<=v(c); }
+  return flags;
+}
+
+// Component-wise u>=v; the result holds one bool per component.
+template<int N,typename T> JZQ_DECORATOR
+Vec<N,bool> operator>=(const Vec<N,T>& u,const Vec<N,T>& v)
+{
+  Vec<N,bool> flags;
+  for(int c=0;c<N;++c) { flags(c) = u(c)>=v(c); }
+  return flags;
+}
+
+// Component-wise equality; yields a Vec of bools, not a single bool.
+template<int N,typename T> JZQ_DECORATOR
+Vec<N,bool> operator==(const Vec<N,T>& u,const Vec<N,T>& v)
+{
+  Vec<N,bool> flags;
+  for(int c=0;c<N;++c) { flags(c) = (u(c)==v(c)); }
+  return flags;
+}
+
+// Component-wise inequality; yields a Vec of bools, not a single bool.
+template<int N,typename T> JZQ_DECORATOR
+Vec<N,bool> operator!=(const Vec<N,T>& u,const Vec<N,T>& v)
+{
+  Vec<N,bool> flags;
+  for(int c=0;c<N;++c) { flags(c) = (u(c)!=v(c)); }
+  return flags;
+}
+
+// Inner product: the sum of u(i)*v(i), seeded with the first product; N must be positive (asserted).
+template<int N,typename T> JZQ_DECORATOR inline T dot(const Vec<N,T>& u,const Vec<N,T>& v)
+{
+  assert(N>0);
+  T acc = u(0)*v(0);
+  for(int c=1;c<N;++c) { acc += u(c)*v(c); }
+  return acc;
+}
+
+// 2D cross product: the scalar a[0]*b[1] - a[1]*b[0].
+template<typename T> JZQ_DECORATOR inline T cross(const Vec<2,T> &a,const Vec<2,T> &b)
+{
+  return a[0]*b[1] - a[1]*b[0];
+}
+
+// 3D cross product a x b.
+template<typename T> JZQ_DECORATOR inline Vec<3,T> cross(const Vec<3,T> &a,const Vec<3,T> &b)
+{
+  return Vec<3,T>(a[1]*b[2] - a[2]*b[1],
+                  a[2]*b[0] - a[0]*b[2],
+                  a[0]*b[1] - a[1]*b[0]);
+}
+
+// Euclidean length: std::sqrt of dot(u,u), converted to T on return.
+template<int N,typename T> JZQ_DECORATOR inline T norm(const Vec<N,T>& u)
+{
+  return std::sqrt(dot(u,u));
+}
+
+// u divided by its norm; a zero-length u is not special-cased.
+template<int N,typename T> JZQ_DECORATOR inline Vec<N,T> normalize(const Vec<N,T>& u)
+{
+  return u/norm(u);
+}
+
+// True when at least one component of u is true.
+template<int N> JZQ_DECORATOR inline bool any(const Vec<N,bool>& u)
+{
+  for(int c=0;c<N;++c)
+  {
+    if (u(c)) { return true; }
+  }
+  return false;
+}
+
+// True when no component of u is false.
+template<int N> JZQ_DECORATOR inline bool all(const Vec<N,bool>& u)
+{
+  for(int c=0;c<N;++c)
+  {
+    if (!u(c)) { return false; }
+  }
+  return true;
+}
+
+// Smallest component of u, found by a linear scan with operator<.
+// N must be positive (asserted).
+template<int N,typename T>
+JZQ_DECORATOR inline T min(const Vec<N,T>& u)
+{
+  assert(N>0);
+
+  T smallest = u(0);
+
+  // A later component replaces the candidate only when it is strictly smaller.
+  for(int c=1;c<N;++c)
+  {
+    if (u(c) < smallest) { smallest = u(c); }
+  }
+
+  return smallest;
+}
+
+// Largest component of u, found by a linear scan with operator>.
+// N must be positive (asserted).
+template<int N,typename T>
+JZQ_DECORATOR inline T max(const Vec<N,T>& u)
+{
+  assert(N>0);
+
+  T largest = u(0);
+
+  // A later component replaces the candidate only when it is strictly larger.
+  for(int c=1;c<N;++c)
+  {
+    if (u(c) > largest) { largest = u(c); }
+  }
+
+  return largest;
+}
+
+// Sum of all components, seeded with the first one; N must be positive (asserted).
+template<int N,typename T>
+JZQ_DECORATOR inline T sum(const Vec<N,T>& u)
+{
+  assert(N>0);
+
+  T total = u(0);
+  for(int c=1;c<N;++c)
+  {
+    total += u(c);
+  }
+
+  return total;
+}
+
+
+namespace std // NOTE(review): the C++ standard does not permit adding function overloads to namespace std — confirm this is acceptable
+{
+template<int N,typename T> Vec<N,T>
+inline min(const Vec<N,T>& u,const Vec<N,T>& v)
+{
+  assert(N>0);
+  // Component-wise minimum: w(i) = min(u(i),v(i)).
+  Vec<N,T> w;
+
+  for(int i=0;i<N;i++)
+  {
+    w(i) = min(u(i),v(i));
+  }
+
+  return w;
+}
+// Component-wise maximum: w(i) = max(u(i),v(i)).
+template<int N,typename T> Vec<N,T>
+inline max(const Vec<N,T>& u,const Vec<N,T>& v)
+{
+  assert(N>0);
+
+  Vec<N,T> w;
+
+  for(int i=0;i<N;i++)
+  {
+    w(i) = max(u(i),v(i));
+  }
+
+  return w;
+}
+}
+
+// Component-wise |x|. std::abs is named so float/double components cannot bind to C's int abs(int); Vec components still resolve by ADL.
+template<int N,typename T> inline Vec<N,T> abs(const Vec<N,T>& x)
+{
+  Vec<N,T> out;
+  for(int i=0;i<N;i++) { using std::abs; out(i) = abs(x(i)); }
+  return out;
+}
+
+#define fori(I) for (int i=0;i<(I);i++) // loop shorthands used by the Mat code below; each macro declares its own index (i, j or k)
+#define forj(J) for (int j=0;j<(J);j++)
+#define fork(K) for (int k=0;k<(K);k++)
+#define forij(I,J) for (int i=0;i<(I);i++) for (int j=0;j<(J);j++) // nested loop: i outer, j inner
+
+// Default constructor: empty body, so no entry is assigned here.
+template<int M,int N,typename T> Mat<M,N,T>::Mat() {}
+
+// Builds a 2x2 matrix from its entries listed row by row; asserts M==2 && N==2.
+template<int M,int N,typename T>
+Mat<M,N,T>::Mat(T a00,T a01,
+                T a10,T a11)
+{
+  assert(M==2 && N==2);
+  m[0][0] = a00;  m[0][1] = a01;
+  m[1][0] = a10;  m[1][1] = a11;
+}
+
+// Builds a 3x3 matrix from its entries listed row by row; asserts M==3 && N==3.
+template<int M,int N,typename T>
+Mat<M,N,T>::Mat(T a00,T a01,T a02,
+                T a10,T a11,T a12,
+                T a20,T a21,T a22)
+{
+  assert(M==3 && N==3);
+  m[0][0] = a00;  m[0][1] = a01;  m[0][2] = a02;
+  m[1][0] = a10;  m[1][1] = a11;  m[1][2] = a12;
+  m[2][0] = a20;  m[2][1] = a21;  m[2][2] = a22;
+}
+
+// Builds a 4x4 matrix from its entries listed row by row; asserts M==4 && N==4.
+template<int M,int N,typename T>
+Mat<M,N,T>::Mat(T a00,T a01,T a02,T a03,
+                T a10,T a11,T a12,T a13,
+                T a20,T a21,T a22,T a23,
+                T a30,T a31,T a32,T a33)
+{
+  assert(M==4 && N==4);
+  m[0][0] = a00;  m[0][1] = a01;  m[0][2] = a02;  m[0][3] = a03;
+  m[1][0] = a10;  m[1][1] = a11;  m[1][2] = a12;  m[1][3] = a13;
+  m[2][0] = a20;  m[2][1] = a21;  m[2][2] = a22;  m[2][3] = a23;
+  m[3][0] = a30;  m[3][1] = a31;  m[3][2] = a32;  m[3][3] = a33;
+}
+
+// Mutable access to the entry in row i, column j; both indices are range-asserted.
+template<int M,int N,typename T> T& Mat<M,N,T>::operator()(int i,int j)
+{
+  assert(0<=i && i<M);
+  assert(0<=j && j<N);
+  return m[i][j];
+}
+
+// Read-only access to the entry in row i, column j; both indices are range-asserted.
+template<int M,int N,typename T> const T& Mat<M,N,T>::operator()(int i,int j) const
+{
+  assert(0<=i && i<M);
+  assert(0<=j && j<N);
+  return m[i][j];
+}
+
+// Pointer to the first entry, m[0][0].
+template<int M,int N,typename T> T* Mat<M,N,T>::data()
+{
+  return &m[0][0];
+}
+
+// Read-only pointer to the first entry, m[0][0].
+template<int M,int N,typename T> const T* Mat<M,N,T>::data() const
+{
+  return &m[0][0];
+}
+
+// Matrix product C = A*B; asserts that the inner dimensions N1 and M2 agree.
+template<int M1,int N1,int M2,int N2,typename T>
+Mat<M1,N2,T> operator*(const Mat<M1,N1,T>& A,const Mat<M2,N2,T>& B)
+{
+  assert(N1==M2);
+  Mat<M1,N2,T> C;
+
+  for (int i=0;i<(M1);i++)
+  for (int j=0;j<(N2);j++)
+  {
+    T acc = 0;
+    for (int k=0;k<(N1);k++) { acc += A(i,k) * B(k,j); }
+    C(i,j) = acc;
+  }
+  return C;
+}
+
+// Matrix-vector product A*u: component i of the result is row i of A dotted with u.
+template<int M,int N,typename T>
+Vec<M,T> operator*(const Mat<M,N,T>& A,const Vec<N,T>& u)
+{
+  Vec<M,T> v;
+
+  for (int i=0;i<(M);i++)
+  {
+    T acc = 0;
+    for (int j=0;j<(N);j++) { acc += A(i,j) * u(j); }
+    v(i) = acc;
+  }
+  return v;
+}
+
+// Vector-matrix product u*A: component j of the result is u dotted with column j of A.
+template<int M,int N,typename T>
+Vec<N,T> operator*(const Vec<M,T>& u,const Mat<M,N,T>& A)
+{
+  Vec<N,T> v;
+
+  for (int j=0;j<(N);j++)
+  {
+    T acc = 0;
+    for (int i=0;i<(M);i++) { acc += A(i,j) * u(i); }
+    v(j) = acc;
+  }
+  return v;
+}
+
+/*
+template<int N, class T>
+Mat<N,N,T> identity()
+{
+    Mat<N,N,T> A;
+    forij(N,N) A(i,j) = ((i==j) ? 1 : 0);
+    return A;
+}
+*/
+
+// Transpose: At(i,j) = A(j,i).
+template<int M,int N,typename T>
+Mat<N,M,T> transpose(const Mat<M,N,T>& A)
+{
+  Mat<N,M,T> At;
+  for (int i=0;i<(N);i++)
+    for (int j=0;j<(M);j++) { At(i,j) = A(j,i); }
+  return At;
+}
+
+// Trace: the sum of the diagonal entries A(i,i).
+template<int N,typename T>
+T trace(const Mat<N,N,T>& A)
+{
+  T total = 0;
+  for (int i=0;i<(N);i++) { total += A(i,i); }
+
+  return total;
+}
+
+// Inverse of a square matrix by in-place Gauss-Jordan elimination with full pivoting.
+// Singular input is not detected (the pivot check below is commented out); it ends in
+// a division by zero. Magnitudes use std::abs: an unqualified abs() can bind to C's
+// int abs(int) for float/double T and truncate every pivot candidate below 1 to 0.
+template<int N,typename T>
+Mat<N,N,T> inverse(const Mat<N,N,T>& A)
+{
+  Mat<N,N,T> invA;
+  invA = A;
+
+  Vec<N,int> colIndex;
+  Vec<N,int> rowIndex;
+  Vec<N,bool> pivoted;
+  fori(N) pivoted(i) = false;
+
+  int i1, i2, row = 0, col = 0;
+  T save;
+  // Each pass picks the largest-magnitude entry among the rows/columns not yet pivoted.
+  for (int i0 = 0; i0 < N; i0++)
+  {
+    T fMax = 0.0f;
+    for (i1 = 0; i1 < N; i1++)
+    {
+      if (!pivoted(i1))
+      {
+        for (i2 = 0; i2 < N; i2++)
+        {
+          if (!pivoted(i2))
+          {
+            T fs = std::abs(invA(i1,i2));
+            if (fs > fMax)
+            {
+              fMax = fs;
+              row = i1;
+              col = i2;
+            }
+          }
+        }
+      }
+    }
+
+    //assert(fmax > eps)
+    pivoted(col) = true;
+
+    if (row != col)
+    {
+        forj(N) { T tmp = invA(row,j); invA(row,j) = invA(col,j); invA(col,j) = tmp; }
+    }
+    // Remember the row swap so that the final loop can undo it.
+    rowIndex(i0) = row;
+    colIndex(i0) = col;
+    // Scale the pivot row; the pivot slot is set to 1 first, so it ends up holding 1/pivot.
+    T inv = ((T)1.0)/invA(col,col);
+    invA(col,col) = (T)1.0;
+    for (i2 = 0; i2 < N; i2++)
+    {
+      invA(col,i2) *= inv;
+    }
+    // Subtract the matching multiple of the pivot row from every other row.
+    for (i1 = 0; i1 < N; i1++)
+    {
+      if (i1 != col)
+      {
+        save = invA(i1,col);
+        invA(i1,col) = (T)0.0;
+        for (i2 = 0; i2 < N; i2++)
+        {
+          invA(i1,i2) -= invA(col,i2)*save;
+        }
+      }
+    }
+  }
+  // Undo the recorded row swaps in reverse order by swapping the matching columns.
+  for (i1 = N-1; i1 >= 0; i1--)
+  {
+    if (rowIndex(i1) != colIndex(i1))
+    {
+      for (i2 = 0; i2 < N; i2++)
+      {
+        save = invA(i2,rowIndex(i1));
+        invA(i2,rowIndex(i1)) = invA(i2,colIndex(i1));
+        invA(i2,colIndex(i1)) = save;
+      }
+    }
+  }
+  return invA;
+}
+
+#undef fori // the single-index loop shorthands are removed again here
+#undef forj
+#undef fork // NOTE(review): forij is not #undef'd and stays defined for code that follows — confirm this is intended
+
+// Empty array: size (0,0) and a null data pointer.
+template<typename T> Array2<T>::Array2() : s(0,0),d(0) {}
+
+// Allocates width*height elements with new[]; no value is assigned to them here. Both extents must be positive (asserted).
+template<typename T> Array2<T>::Array2(int width,int height)
+{
+  assert(width>0 && height>0);
+  s = Vec2i(width,height);
+  d = new T[s(0)*s(1)];
+}
+
+// Same allocation, with the extents passed as a Vec2i (width, height).
+template<typename T> Array2<T>::Array2(const Vec2i& size)
+{
+  // XXX: rework this into something like assert(all(s>0));
+  assert(size(0)>0 && size(1)>0);
+  s = size;
+  d = new T[s(0)*s(1)];
+}
+
+// Copy constructor: allocates a buffer of the same size and copies every element
+// with T's assignment operator; an empty source gives a null data pointer.
+template<typename T>
+Array2<T>::Array2(const Array2<T>& a)
+{
+  //  printf("COPY CONSTRUCTOR\n");
+  s = a.s;
+  d = 0;
+
+  if (s(0)>0 && s(1)>0)
+  {
+    const int count = s(0)*s(1);
+    d = new T[count];
+
+    // XXX: optimize this:
+    for(int idx=0;idx<count;++idx) { d[idx] = a.d[idx]; }
+  }
+}
+
+// Copy assignment (deep copy); self-assignment is a no-op. Arrays of equal size are
+// copied element-wise into the existing storage. Otherwise the replacement buffer is
+// allocated and filled BEFORE the old one is released: if new[] throws, *this is left
+// untouched instead of keeping a dangling d that the destructor would delete[] again.
+// Returns *this.
+template<typename T>
+Array2<T>& Array2<T>::operator=(const Array2<T>& a)
+{
+  if (this==&a) { return *this; }
+
+  if (s(0)==a.s(0) && s(1)==a.s(1))
+  {
+    // Same size: reuse the storage (zero iterations when both arrays are empty).
+    // XXX: optimize this:
+    for(int i=0;i<s(0)*s(1);i++) d[i] = a.d[i];
+    //memcpy(d,a.d,numel()*sizeof(T)); //XXX this will break down when T is not POD !!!
+    return *this;
+  }
+
+  // Different size: build the new contents on the side first.
+  T* fresh = 0;
+
+  if (a.s(0)>0 && a.s(1)>0)
+  {
+    const int count = a.s(0)*a.s(1);
+    fresh = new T[count];
+
+    // XXX: optimize this:
+    for(int i=0;i<count;i++) fresh[i] = a.d[i];
+  }
+
+  // Commit: nothing above modified *this.
+  delete[] d;
+  d = fresh;
+  s = a.s;
+
+  return *this;
+}
+
+// Releases the element buffer with delete[].
+template<typename T> Array2<T>::~Array2()
+{
+  delete[] d;
+}
+
+// Mutable access by flat index into the element buffer; asserts 0<=i<numel().
+template<typename T>
+inline T& Array2<T>::operator[](int i)
+{
+  assert(i>=0 && i<numel());
+  return d[i];
+}
+
+// Read-only access by flat index into the element buffer; asserts 0<=i<numel().
+template<typename T>
+inline const T& Array2<T>::operator[](int i) const
+{
+  assert(i>=0 && i<numel());
+  return d[i];
+}
+
+// Mutable element (i,j), stored at flat index i + j*width; indices and a non-null buffer are asserted.
+template<typename T>
+inline T& Array2<T>::operator()(int i,int j)
+{
+  assert(d!=0);
+  assert(i>=0 && i<s(0) &&
+         j>=0 && j<s(1));
+  return d[j*s(0)+i];
+}
+
+// Read-only element (i,j), stored at flat index i + j*width; indices and a non-null buffer are asserted.
+template<typename T>
+inline const T& Array2<T>::operator()(int i,int j) const
+{
+  assert(d!=0);
+  assert(i>=0 && i<s(0) &&
+         j>=0 && j<s(1));
+  return d[j*s(0)+i];
+}
+
+// Mutable element addressed by the index pair ij = (i,j); same layout and asserts as the (i,j) form.
+template<typename T>
+inline T& Array2<T>::operator()(const Vec<2,int>& ij)
+{
+  assert(d!=0);
+  assert(ij(0)>=0 && ij(0)<s(0) &&
+         ij(1)>=0 && ij(1)<s(1));
+  return d[ij(1)*s(0)+ij(0)];
+}
+
+// Read-only element addressed by the index pair ij = (i,j); same layout and asserts as the (i,j) form.
+template<typename T>
+inline const T& Array2<T>::operator()(const Vec<2,int>& ij) const
+{
+  assert(d!=0);
+  assert(ij(0)>=0 && ij(0)<s(0) &&
+         ij(1)>=0 && ij(1)<s(1));
+  return d[ij(1)*s(0)+ij(0)];
+}
+
+// Extents as (width, height).
+template<typename T> Vec2i Array2<T>::size() const
+{
+  return s;
+}
+
+// Extent along one dimension: 0 = width, 1 = height (asserted).
+template<typename T> int Array2<T>::size(int dim) const
+{
+  assert(dim==0 || dim==1);
+  return s(dim);
+}
+
+// Extent along dimension 0.
+template<typename T> int Array2<T>::width() const
+{
+  return s(0);
+}
+
+// Extent along dimension 1.
+template<typename T> int Array2<T>::height() const
+{
+  return s(1);
+}
+
+// Number of elements: width times height.
+template<typename T> int Array2<T>::numel() const
+{
+  return s(0)*s(1);
+}
+
+// Pointer to the element buffer (null for an empty array).
+template<typename T> T* Array2<T>::data()
+{
+  return d;
+}
+
+// Read-only pointer to the element buffer (null for an empty array).
+template<typename T> const T* Array2<T>::data() const
+{
+  return d;
+}
+
+// True when the array holds no elements.
+template<typename T> bool Array2<T>::empty() const
+{
+  return numel()==0;
+}
+
+// Frees the buffer and resets the array to size (0,0) with a null data pointer.
+template<typename T> void Array2<T>::clear()
+{
+  delete[] d;
+  d = 0;
+  s = Vec2i(0,0);
+}
+
+// Exchanges size and buffer pointer with b; no element is copied.
+template<typename T>
+void Array2<T>::swap(Array2<T>& b)
+{
+  const Vec2i old_s = s;
+  T* const    old_d = d;
+  s = b.s;
+  d = b.d;
+  b.s = old_s;
+  b.d = old_d;
+}
+
+// Free-function form of a.size().
+template<typename T> Vec2i size(const Array2<T>& a)
+{
+  return a.size();
+}
+
+// Free-function form of a.size(dim).
+template<typename T> int size(const Array2<T>& a,int dim)
+{
+  return a.size(dim);
+}
+
+// Free-function form of a.numel().
+template<typename T> int numel(const Array2<T>& a)
+{
+  return a.numel();
+}
+
+// Free-function form of a->clear(); a is dereferenced without a null check.
+template<typename T> void clear(Array2<T>* a)
+{
+  a->clear();
+}
+
+// Free-function form of a.swap(b).
+template<typename T> void swap(Array2<T>& a,Array2<T>& b)
+{
+  a.swap(b);
+}
+
+// Smallest element of a (linear scan with operator<); a must not be empty (asserted).
+template<typename T>
+T min(const Array2<T>& a)
+{
+  assert(numel(a)>0);
+
+  const int count = numel(a);
+  const T* elems = a.data();
+
+  T smallest = elems[0];
+  for(int idx=1;idx<count;++idx)
+    if (elems[idx]<smallest) { smallest = elems[idx]; }
+
+  return smallest;
+}
+
+// Largest element of a (linear scan with operator<); a must not be empty (asserted).
+template<typename T>
+T max(const Array2<T>& a)
+{
+  assert(numel(a)>0);
+
+  const int count = numel(a);
+  const T* elems = a.data();
+
+  T largest = elems[0];
+  for(int idx=1;idx<count;++idx)
+    if (largest<elems[idx]) { largest = elems[idx]; }
+
+  return largest;
+}
+
+// Smallest and largest element of a in one pass, returned as (min,max); a must not be empty (asserted).
+template<typename T>
+Vec<2,T> minmax(const Array2<T>& a)
+{
+  assert(numel(a)>0);
+
+  const int count = numel(a);
+  const T* elems = a.data();
+
+  T lo = elems[0];
+  T hi = elems[0];
+
+  for(int idx=1;idx<count;++idx)
+  {
+    if (elems[idx]<lo) { lo = elems[idx]; }
+    if (hi<elems[idx]) { hi = elems[idx]; }
+  }
+
+  return Vec<2,T>(lo,hi);
+}
+
+// Index (i,j) of the smallest element; on ties the first one met wins (j is the outer loop, i the inner).
+template<typename T>
+Vec2i argmin(const Array2<T>& a)
+{
+  assert(numel(a)>0);
+
+  T lowest = a(0,0);
+  Vec2i where(0,0);
+
+  for(int y=0;y<a.height();y++)
+  {
+    for(int x=0;x<a.width();x++)
+    {
+      if (a(x,y)<lowest)
+      {
+        lowest = a(x,y);
+        where = Vec2i(x,y);
+      }
+    }
+  }
+  return where;
+}
+
+// Index (i,j) of the largest element; on ties the first one met wins (j is the outer loop, i the inner).
+template<typename T>
+Vec2i argmax(const Array2<T>& a)
+{
+  assert(numel(a)>0);
+
+  T highest = a(0,0);
+  Vec2i where(0,0);
+
+  for(int y=0;y<a.height();y++)
+  {
+    for(int x=0;x<a.width();x++)
+    {
+      if (highest<a(x,y))
+      {
+        highest = a(x,y);
+        where = Vec2i(x,y);
+      }
+    }
+  }
+  return where;
+}
+
+// Sum of all elements, seeded with the first one; a must not be empty (asserted).
+template<typename T>
+T sum(const Array2<T>& a)
+{
+  assert(numel(a)>0);
+
+  const int count = numel(a);
+  const T* elems = a.data();
+
+  T total = elems[0];
+
+  for(int idx=1;idx<count;++idx) { total += elems[idx]; }
+
+  return total;
+}
+
+// Assigns value to every element of *a; a must be non-null and non-empty (both asserted).
+template<typename T>
+void fill(Array2<T>* a,const T& value)
+{
+  assert(a!=0);
+  assert(a->numel()>0);
+
+  const int count = a->numel();
+  T* elems = a->data();
+  for(int idx=0;idx<count;++idx) { elems[idx] = value; }
+}
+
+// Returns a new array of the same size whose elements are fun(element of a); a must not be empty (asserted).
+template<typename T,typename F>
+Array2<T> apply(const Array2<T>& a,F fun)
+{
+  assert(numel(a) > 0);
+
+  Array2<T> result(size(a));
+  const int count = numel(a);
+
+  for(int idx=0;idx<count;++idx) { result.data()[idx] = fun(a.data()[idx]); }
+
+  return result;
+}
+
+// Empty array: size (0,0,0) and a null data pointer.
+template<typename T> Array3<T>::Array3() : s(0,0,0),d(0) {}
+
+// Allocates width*height*depth elements with new[]; no value is assigned to them here. All extents must be positive (asserted).
+template<typename T> Array3<T>::Array3(int width,int height,int depth)
+{
+  assert(width>0 && height>0 && depth>0);
+  s = Vec3i(width,height,depth);
+  d = new T[s(0)*s(1)*s(2)];
+}
+
+// Same allocation, with the extents passed as a Vec3i (width, height, depth).
+template<typename T> Array3<T>::Array3(const Vec3i& size)
+{
+  // XXX: rework this into something like assert(all(s>0));
+  assert(size(0)>0 && size(1)>0 && size(2)>0);
+  s = size;
+  d = new T[s(0)*s(1)*s(2)];
+}
+
+// Copy constructor: allocates a buffer of the same size and copies every element
+// with T's assignment operator; an empty source gives a null data pointer.
+template<typename T>
+Array3<T>::Array3(const Array3<T>& a)
+{
+  //  printf("COPY CONSTRUCTOR\n");
+  s = a.s;
+  d = 0;
+
+  if (s(0)>0 && s(1)>0 && s(2)>0)
+  {
+    const int count = s(0)*s(1)*s(2);
+    d = new T[count];
+
+    // XXX: optimize this:
+    for(int idx=0;idx<count;++idx) { d[idx] = a.d[idx]; }
+    //memcpy((void *)d, (void *)a.d, sizeof(T)*s(0)*s(1)*s(2));
+  }
+}
+
+// Copy assignment (deep copy); self-assignment is a no-op. Arrays of equal size are
+// copied element-wise into the existing storage. Otherwise the replacement buffer is
+// allocated and filled BEFORE the old one is released: if new[] throws, *this is left
+// untouched instead of keeping a dangling d that the destructor would delete[] again.
+// Returns *this.
+template<typename T>
+Array3<T>& Array3<T>::operator=(const Array3<T>& a)
+{
+  if (this==&a) { return *this; }
+
+  if (s(0)==a.s(0) && s(1)==a.s(1) && s(2)==a.s(2))
+  {
+    // Same size: reuse the storage (zero iterations when both arrays are empty).
+    // XXX: optimize this:
+    for(int i=0;i<s(0)*s(1)*s(2);i++) d[i] = a.d[i];
+    //memcpy((void *)d, (void *)a.d, sizeof(T)*s(0)*s(1)*s(2));
+    return *this;
+  }
+
+  // Different size: build the new contents on the side first.
+  T* fresh = 0;
+
+  if (a.s(0)>0 && a.s(1)>0 && a.s(2)>0)
+  {
+    const int count = a.s(0)*a.s(1)*a.s(2);
+    fresh = new T[count];
+
+    // XXX: optimize this:
+    for(int i=0;i<count;i++) fresh[i] = a.d[i];
+  }
+
+  // Commit: nothing above modified *this.
+  delete[] d;
+  d = fresh;
+  s = a.s;
+
+  return *this;
+}
+
+// Releases the element buffer with delete[].
+template<typename T> Array3<T>::~Array3()
+{
+  delete[] d;
+}
+
+// Mutable access by flat index into the element buffer; asserts 0<=i<numel().
+template<typename T>
+inline T& Array3<T>::operator[](int i)
+{
+  assert(i>=0 && i<numel());
+  return d[i];
+}
+
+// Read-only access by flat index into the element buffer; asserts 0<=i<numel().
+template<typename T>
+inline const T& Array3<T>::operator[](int i) const
+{
+  assert(i>=0 && i<numel());
+  return d[i];
+}
+
+// Mutable element (i,j,k), stored at flat index i + (j + k*height)*width; indices and a non-null buffer are asserted.
+template<typename T>
+inline T& Array3<T>::operator()(int i,int j,int k)
+{
+  assert(d!=0);
+  assert(i>=0 && i<s(0) &&
+         j>=0 && j<s(1) &&
+         k>=0 && k<s(2));
+  return d[(k*s(1)+j)*s(0)+i];
+}
+
+// Read-only element (i,j,k), stored at flat index i + (j + k*height)*width; indices and a non-null buffer are asserted.
+template<typename T>
+inline const T& Array3<T>::operator()(int i,int j,int k) const
+{
+  assert(d!=0);
+  assert(i>=0 && i<s(0) &&
+         j>=0 && j<s(1) &&
+         k>=0 && k<s(2));
+  return d[(k*s(1)+j)*s(0)+i];
+}
+
+// Mutable element addressed by the index triple ijk = (i,j,k); same layout and asserts as the (i,j,k) form.
+template<typename T>
+inline T& Array3<T>::operator()(const Vec<3,int>& ijk)
+{
+  assert(d!=0);
+  assert(ijk(0)>=0 && ijk(0)<s(0) &&
+         ijk(1)>=0 && ijk(1)<s(1) &&
+         ijk(2)>=0 && ijk(2)<s(2));
+  return d[(ijk(2)*s(1)+ijk(1))*s(0)+ijk(0)];
+}
+
+// Read-only element addressed by the index triple ijk = (i,j,k); same layout and asserts as the (i,j,k) form.
+template<typename T>
+inline const T& Array3<T>::operator()(const Vec<3,int>& ijk) const
+{
+  assert(d!=0);
+  assert(ijk(0)>=0 && ijk(0)<s(0) &&
+         ijk(1)>=0 && ijk(1)<s(1) &&
+         ijk(2)>=0 && ijk(2)<s(2));
+  return d[(ijk(2)*s(1)+ijk(1))*s(0)+ijk(0)];
+}
+
+// Extents as (width, height, depth).
+template<typename T> Vec3i Array3<T>::size() const
+{
+  return s;
+}
+
+// Extent along one dimension: 0 = width, 1 = height, 2 = depth (asserted).
+template<typename T> int Array3<T>::size(int dim) const
+{
+  assert(dim==0 || dim==1 || dim==2);
+  return s(dim);
+}
+
+// Extent along dimension 0.
+template<typename T> int Array3<T>::width() const
+{
+  return s(0);
+}
+
+// Extent along dimension 1.
+template<typename T> int Array3<T>::height() const
+{
+  return s(1);
+}
+
+// Extent along dimension 2.
+template<typename T> int Array3<T>::depth() const
+{
+  return s(2);
+}
+
+// Number of elements: width times height times depth.
+template<typename T> int Array3<T>::numel() const
+{
+  return s(0)*s(1)*s(2);
+}
+
+// Pointer to the element buffer (null for an empty array).
+template<typename T> T* Array3<T>::data()
+{
+  return d;
+}
+
+// Read-only pointer to the element buffer (null for an empty array).
+template<typename T> const T* Array3<T>::data() const
+{
+  return d;
+}
+
+// Frees the buffer and resets the array to size (0,0,0) with a null data pointer.
+template<typename T> void Array3<T>::clear()
+{
+  delete[] d;
+  d = 0;
+  s = Vec3i(0,0,0);
+}
+
+// Exchanges size and buffer pointer with b; no element is copied.
+template<typename T>
+void Array3<T>::swap(Array3<T>& b)
+{
+  const Vec3i old_s = s;
+  T* const    old_d = d;
+  s = b.s;
+  d = b.d;
+  b.s = old_s;
+  b.d = old_d;
+}
+
+// True when the array holds no elements.
+template<typename T> bool Array3<T>::empty() const
+{
+  return numel()==0;
+}
+
+// Free-function form of a.size().
+template<typename T> Vec3i size(const Array3<T>& a)
+{
+  return a.size();
+}
+
+// Free-function form of a.size(dim).
+template<typename T> int size(const Array3<T>& a,int dim)
+{
+  return a.size(dim);
+}
+
+// Free-function form of a.numel().
+template<typename T> int numel(const Array3<T>& a)
+{
+  return a.numel();
+}
+
+// Free-function form of a->clear(); a is dereferenced without a null check.
+template<typename T> void clear(Array3<T>* a)
+{
+  a->clear();
+}
+
+// Free-function form of a.swap(b).
+template<typename T> void swap(Array3<T>& a,Array3<T>& b)
+{
+  a.swap(b);
+}
+
+// Assigns value to every element of *a; a must be non-null and non-empty (both asserted).
+template<typename T>
+void fill(Array3<T>* a,const T& value)
+{
+  assert(a!=0);
+  assert(a->numel()>0);
+
+  const int count = a->numel();
+  T* elems = a->data();
+  for(int idx=0;idx<count;++idx) { elems[idx] = value; }
+}
+
+// Loads an Array3 from fileName through the pointer-based a3read; an empty array signals failure.
+template<typename T> Array3<T> a3read(const std::string& fileName)
+{
+  Array3<T> loaded;
+  if(a3read(&loaded,fileName)) { return loaded; }
+  return Array3<T>();
+}
+
+// Reads an Array3 from a binary file: four ints (width, height, depth, element
+// size) followed by the raw elements. Returns false when the file cannot be
+// opened, is truncated, or its header does not fit T. Every dimension must be
+// positive and the element count must fit in an int; the former (w*h*d)<1 test
+// accepted pairs of negative dimensions and products that overflow int.
+template<typename T>
+bool a3read(Array3<T>* out_A,const std::string& fileName)
+{
+  FILE* f = fopen(fileName.c_str(),"rb");
+  if(!f) { return false; }
+
+  int w,h,d,s;
+  bool ok = fread(&w,sizeof(w),1,f)==1 && fread(&h,sizeof(h),1,f)==1 &&
+            fread(&d,sizeof(d),1,f)==1 && fread(&s,sizeof(s),1,f)==1;
+
+  // The size check is done in 64 bits so that the products cannot overflow.
+  ok = ok && w>0 && h>0 && d>0 && s==(int)sizeof(T) &&
+       (long long)w*h <= 0x7fffffffLL/d;
+
+  if(!ok) { fclose(f); return false; }
+
+  Array3<T> A(w,h,d);
+  ok = fread(A.data(),sizeof(T)*A.numel(),1,f)==1;
+  fclose(f);
+
+  if(!ok) { return false; }
+
+  // The result is published only after the whole payload has been read.
+  if(out_A!=0) { *out_A = A; }
+
+  return true;
+}
+
+// Writes A as four ints (width, height, depth, sizeof(T)) followed by the raw
+// element bytes. Returns false for an empty array, a file that cannot be opened,
+// or a failed write.
+template<typename T>
+bool a3write(const Array3<T>& A,const std::string& fileName)
+{
+  if(A.numel()==0) { return false; }
+
+  FILE* f = fopen(fileName.c_str(),"wb");
+  if(!f) { return false; }
+
+  const int w = A.width();
+  const int h = A.height();
+  const int d = A.depth();
+  const int s = sizeof(T);
+
+  // Header first, payload last; the chain stops at the first fwrite that fails.
+  const bool written = fwrite(&w,sizeof(w),1,f)==1 &&
+                       fwrite(&h,sizeof(h),1,f)==1 &&
+                       fwrite(&d,sizeof(d),1,f)==1 &&
+                       fwrite(&s,sizeof(s),1,f)==1 &&
+                       fwrite(A.data(),sizeof(T)*w*h*d,1,f)==1;
+
+  fclose(f);
+  return written;
+}
+#endif
diff --git a/src/patchmatch_gpu.h b/src/patchmatch_gpu.h
deleted file mode 100644
index a1adaf1..0000000
--- a/src/patchmatch_gpu.h
+++ /dev/null
@@ -1,410 +0,0 @@
-// This software is in the public domain. Where that dedication is not
-// recognized, you are granted a perpetual, irrevocable license to copy
-// and modify this file as you see fit.
-
-#ifndef PATCHMATCH_GPU_H_
-#define PATCHMATCH_GPU_H_
-
-#include <stdint.h>
-#include <cfloat>
-
-#include "texarray2.h"
-#include "memarray2.h"
-
-struct pcgState
-{
-  uint64_t state;
-  uint64_t increment;
-};
-
-__device__ void pcgAdvance(pcgState* rng)
-{
-  rng->state = rng->state * 6364136223846793005ULL + rng->increment;
-}
-
-__device__ uint32_t pcgOutput(uint64_t state)
-{
-  return (uint32_t)(((state >> 22u) ^ state) >> ((state >> 61u) + 22u));
-}
-
-__device__ uint32_t pcgRand(pcgState* rng)
-{
-  uint64_t oldstate = rng->state;
-  pcgAdvance(rng);
-  return pcgOutput(oldstate);
-}
-
-__device__ void pcgInit(pcgState* rng,uint64_t seed,uint64_t stream)
-{
-  rng->state = 0U;
-  rng->increment = (stream << 1u) | 1u;
-  pcgAdvance(rng);
-  rng->state += seed;
-  pcgAdvance(rng);
-}
-
-typedef Vec<1,float> V1f;
-typedef Array2<Vec<1,float>> A2V1f;
-
-__global__ void krnlInitRngStates(const int width,
-                                  const int height,
-                                  pcgState* rngStates)
-{
-  const int x = blockDim.x*blockIdx.x + threadIdx.x;
-  const int y = blockDim.y*blockIdx.y + threadIdx.y;
-
-  if (x<width && y<height)
-  {
-    const int idx = x+y*width;
-    pcgInit(&rngStates[idx],1337,idx);    
-  }
-}
-
-pcgState* initGpuRng(const int width,
-                     const int height)
-{
-  pcgState* gpuRngStates;
-  cudaMalloc(&gpuRngStates,width*height*sizeof(pcgState));
-
-  const dim3 threadsPerBlock(16,16);
-  const dim3 numBlocks((width+threadsPerBlock.x)/threadsPerBlock.x,
-                       (height+threadsPerBlock.y)/threadsPerBlock.y);
-
-  krnlInitRngStates<<<numBlocks,threadsPerBlock>>>(width,height,gpuRngStates);
-
-  return gpuRngStates;
-}
-
-template<int N,typename T,int M>
-struct PatchSSD
-{
-  const TexArray2<N,T,M> A;
-  const TexArray2<N,T,M> B;
-  const Vec<N,float> weights;
-
-  PatchSSD(const TexArray2<N,T,M>& A,
-           const TexArray2<N,T,M>& B,
-           const Vec<N,float>& weights)
-
-  : A(A),B(B),weights(weights) {}
-
-   __device__ float operator()(int patchWidth,
-                               const int ax,
-                               const int ay,
-                               const int bx,
-                               const int by,
-                               const float ebest)
-   {
-    const int hpw = patchWidth/2;
-    float ssd = 0;
-
-    for(int py=-hpw;py<=+hpw;py++)
-    {
-      for(int px=-hpw;px<=+hpw;px++)
-      {
-        const Vec<N,T> pixelA = A(ax + px, ay + py);
-        const Vec<N,T> pixelB = B(bx + px, by + py);
-        for(int i=0;i<N;i++)
-        {
-          const float diff = float(pixelA[i])-float(pixelB[i]);
-          ssd += weights[i]*diff*diff;
-        }
-      }
-
-      if (ssd>ebest) { return ssd; }
-    }
-
-    return ssd;
-   }
-};
-
-template<typename FUNC>
-__global__ void krnlEvalErrorPass(const int patchWidth,
-                                  FUNC patchError,
-                                  const TexArray2<2,int> NNF,
-                                  TexArray2<1,float> E)
-{
-  const int x = blockDim.x*blockIdx.x + threadIdx.x;
-  const int y = blockDim.y*blockIdx.y + threadIdx.y;
-
-  if (x<NNF.width && y<NNF.height)
-  {
-    const V2i n = NNF(x,y);
-    E.write(x,y,V1f(patchError(patchWidth,x,y,n[0],n[1],FLT_MAX)));
-  }
-}
-
-void __device__ updateOmega(MemArray2<int>& Omega,const int patchWidth,const int bx,const int by,const int incdec)
-{
-  const int r = patchWidth/2;
-
-  for(int oy=-r;oy<=+r;oy++)
-  for(int ox=-r;ox<=+r;ox++)
-  {
-    const int x = bx+ox;
-    const int y = by+oy;
-    atomicAdd(&Omega.data[x+y*Omega.width],incdec);
-    //Omega.data[x+y*Omega.width] += incdec;
-  }
-}
-
-int __device__ patchOmega(const int patchWidth,const int bx,const int by,const MemArray2<int>& Omega)
-{
-  const int r = patchWidth/2;
-
-  int sum = 0;
-
-  for(int oy=-r;oy<=+r;oy++)
-  for(int ox=-r;ox<=+r;ox++)
-  {
-    const int x = bx+ox;
-    const int y = by+oy;
-    sum += Omega.data[x+y*Omega.width]; /// XXX: atomic read instead ??
-  }
-
-  return sum;
-}
-
-template<typename FUNC>
-__device__ void tryPatch(const  V2i& sizeA,
-                         const  V2i& sizeB,
-                                MemArray2<int>& Omega,
-                         const  int patchWidth,
-                         FUNC   patchError,
-                         const  float lambda,
-                         const  int ax,
-                         const  int ay,
-                         const  int bx,
-                         const  int by,
-                         V2i&   nbest,
-                         float& ebest)
-{
-  const float omegaBest = (float(sizeA(0)*sizeA(1)) /
-                           float(sizeB(0)*sizeB(1))) * float(patchWidth*patchWidth);
-
-  const float curOcc = (float(patchOmega(patchWidth,nbest(0),nbest(1),Omega))/float(patchWidth*patchWidth))/omegaBest;
-  const float newOcc = (float(patchOmega(patchWidth,      bx,      by,Omega))/float(patchWidth*patchWidth))/omegaBest;
-
-  const float curErr = ebest;
-  const float newErr = patchError(patchWidth,ax,ay,bx,by,curErr+lambda*curOcc);
-
-  if ((newErr+lambda*newOcc) < (curErr+lambda*curOcc))
-  {
-    updateOmega(Omega,patchWidth,      bx,      by,+1);
-    updateOmega(Omega,patchWidth,nbest(0),nbest(1),-1);
-    nbest = V2i(bx,by);
-    ebest = newErr;
-  }
-}
-
-template<typename FUNC>
-__device__ void tryNeighborsOffset(const int x,
-                                   const int y,
-                                   const int ox,
-                                   const int oy,
-                                   V2i& nbest,
-                                   float& ebest,
-                                   const V2i& sizeA,
-                                   const V2i& sizeB,
-                                         MemArray2<int>& Omega,
-                                   const int patchWidth,
-                                   FUNC patchError,
-                                   const float lambda,
-                                   const TexArray2<2,int>& NNF)
-{
-  const int hpw = patchWidth/2;
-
-  const V2i on = NNF(x+ox,y+oy);
-  const int nx = on(0)-ox;
-  const int ny = on(1)-oy;
-
-  if (nx>=hpw && nx<sizeB(0)-hpw &&
-      ny>=hpw && ny<sizeB(1)-hpw)
-  {
-    tryPatch(sizeA,sizeB,Omega,patchWidth,patchError,lambda,x,y,nx,ny,nbest,ebest);
-  }
-}
-
-template<typename FUNC>
-__global__ void krnlPropagationPass(const V2i sizeA,
-                                    const V2i sizeB,
-                                          MemArray2<int> Omega,
-                                    const int patchWidth,
-                                    FUNC  patchError,
-                                    const float lambda,
-                                    const int r,
-                                    const TexArray2<2,int> NNF,
-                                    TexArray2<2,int> NNF2,
-                                    TexArray2<1,float> E,
-                                    TexArray2<1,unsigned char> mask)
-{
-  const int x = blockDim.x*blockIdx.x + threadIdx.x;
-  const int y = blockDim.y*blockIdx.y + threadIdx.y;
-
-  if (x<sizeA(0) && y<sizeA(1))
-  {
-    V2i   nbest = NNF(x,y);
-    float ebest = E(x,y)(0);
-
-    if (mask(x,y)[0]==255)
-    {
-      tryNeighborsOffset(x,y,-r,0,nbest,ebest,sizeA,sizeB,Omega,patchWidth,patchError,lambda,NNF);
-      tryNeighborsOffset(x,y,+r,0,nbest,ebest,sizeA,sizeB,Omega,patchWidth,patchError,lambda,NNF);
-      tryNeighborsOffset(x,y,0,-r,nbest,ebest,sizeA,sizeB,Omega,patchWidth,patchError,lambda,NNF);
-      tryNeighborsOffset(x,y,0,+r,nbest,ebest,sizeA,sizeB,Omega,patchWidth,patchError,lambda,NNF);
-    }
-
-    E.write(x,y,V1f(ebest));
-    NNF2.write(x,y,nbest);
-  }
-}
-
-template<typename FUNC>
-__device__ void tryRandomOffsetInRadius(const int r,
-                                        const V2i& sizeA,
-                                        const V2i& sizeB,
-                                              MemArray2<int>& Omega,
-                                        const int patchWidth,
-                                        FUNC  patchError,
-                                        const float lambda,
-                                        const int x,
-                                        const int y,
-                                        const V2i& norg,
-                                        V2i&  nbest,
-                                        float& ebest,
-                                        pcgState* rngState)
-{
-  const int hpw = patchWidth/2;
-
-  const int xmin = max(norg(0)-r,hpw);
-  const int xmax = min(norg(0)+r,sizeB(0)-1-hpw);
-  const int ymin = max(norg(1)-r,hpw);
-  const int ymax = min(norg(1)+r,sizeB(1)-1-hpw);
-
-  const int nx = xmin+(pcgRand(rngState)%(xmax-xmin+1));
-  const int ny = ymin+(pcgRand(rngState)%(ymax-ymin+1));
-
-  tryPatch(sizeA,sizeB,Omega,patchWidth,patchError,lambda,x,y,nx,ny,nbest,ebest);
-}
-
-/*
-template<typename FUNC>
-__global__ void krnlRandomSearchPass(const V2i sizeA,
-                                     const V2i sizeB,
-                                     MemArray2<int> Omega,
-                                     const int patchWidth,
-                                     FUNC  patchError,
-                                     const float lambda,
-                                     TexArray2<2,int> NNF,
-                                     TexArray2<1,float> E,
-                                     TexArray2<1,unsigned char> mask,
-                                     pcgState* rngStates)
-{
-  const int x = blockDim.x*blockIdx.x + threadIdx.x;
-  const int y = blockDim.y*blockIdx.y + threadIdx.y;
-
-  if (x<sizeA(0) && y<sizeA(1))
-  {
-    if (mask(x,y)[0]==255)
-    {
-      V2i nbest = NNF(x,y);
-      float ebest = E(x,y)(0);
-
-      const V2i norg = nbest;
-
-      for(int r=1;r<max(sizeB(0),sizeB(1))/2;r=r*2)
-      {
-        tryRandomOffsetInRadius(r,sizeA,sizeB,Omega,patchWidth,patchError,lambda,x,y,norg,nbest,ebest,&rngStates[x+y*NNF.width]);
-      }
-
-      E.write(x,y,V1f(ebest));
-      NNF.write(x,y,nbest);
-    }
-  }
-}
-*/
-
-template<typename FUNC>
-__global__ void krnlRandomSearchPass(const V2i sizeA,
-                                     const V2i sizeB,
-                                     MemArray2<int> Omega,
-                                     const int patchWidth,
-                                     FUNC  patchError,
-                                     const float lambda,
-                                     const int radius,
-                                     TexArray2<2,int> NNF,
-                                     TexArray2<1,float> E,
-                                     TexArray2<1,unsigned char> mask,
-                                     pcgState* rngStates)
-{
-  const int x = blockDim.x*blockIdx.x + threadIdx.x;
-  const int y = blockDim.y*blockIdx.y + threadIdx.y;
-
-  if (x<sizeA(0) && y<sizeA(1))
-  {
-    if (mask(x,y)[0]==255)
-    {
-      V2i nbest = NNF(x,y);
-      float ebest = E(x,y)(0);
-
-      const V2i norg = nbest;
-
-      tryRandomOffsetInRadius(radius,sizeA,sizeB,Omega,patchWidth,patchError,lambda,x,y,norg,nbest,ebest,&rngStates[x+y*NNF.width]);
-
-      E.write(x,y,V1f(ebest));
-      NNF.write(x,y,nbest);
-    }
-  }
-}
-
-template<typename FUNC>
-void patchmatchGPU(const V2i sizeA,
-                   const V2i sizeB,
-                   MemArray2<int>& Omega,
-                   const int patchWidth,
-                   FUNC patchError,
-                   const float lambda,
-                   const int numIters,
-                   const int numThreadsPerBlock,
-                   TexArray2<2,int>& NNF,
-                   TexArray2<2,int>& NNF2,
-                   TexArray2<1,float>& E,
-                   TexArray2<1,unsigned char>& mask,
-                   pcgState* rngStates)
-{
-  const dim3 threadsPerBlock = dim3(numThreadsPerBlock,numThreadsPerBlock);
-  const dim3 numBlocks = dim3((NNF.width+threadsPerBlock.x)/threadsPerBlock.x,
-                              (NNF.height+threadsPerBlock.y)/threadsPerBlock.y);
-
-  krnlEvalErrorPass<<<numBlocks,threadsPerBlock>>>(patchWidth,patchError,NNF,E);
-
-  checkCudaError(cudaDeviceSynchronize());
-
-  for(int i=0;i<numIters;i++)
-  {
-    krnlPropagationPass<<<numBlocks,threadsPerBlock>>>(sizeA,sizeB,Omega,patchWidth,patchError,lambda,4,NNF,NNF2,E,mask); std::swap(NNF,NNF2);
-
-    checkCudaError(cudaDeviceSynchronize());
-
-    krnlPropagationPass<<<numBlocks,threadsPerBlock>>>(sizeA,sizeB,Omega,patchWidth,patchError,lambda,2,NNF,NNF2,E,mask); std::swap(NNF,NNF2);
-
-    checkCudaError(cudaDeviceSynchronize());
-
-    krnlPropagationPass<<<numBlocks,threadsPerBlock>>>(sizeA,sizeB,Omega,patchWidth,patchError,lambda,1,NNF,NNF2,E,mask); std::swap(NNF,NNF2);
-
-    checkCudaError(cudaDeviceSynchronize());
-
-    for(int r=1;r<max(sizeB(0),sizeB(1))/2;r=r*2)
-    {
-      krnlRandomSearchPass<<<numBlocks,threadsPerBlock>>>(sizeA,sizeB,Omega,patchWidth,patchError,lambda,r,NNF,E,mask,rngStates);
-    }
-
-    checkCudaError(cudaDeviceSynchronize());
-  }
-
-  krnlEvalErrorPass<<<numBlocks,threadsPerBlock>>>(patchWidth,patchError,NNF,E);
-
-  checkCudaError(cudaDeviceSynchronize());
-}
-
-#endif