Efficient, high-quality Bayer demosaic filtering on GPUs

In jgt 2009

Morgan McGuire, Williams College

Preprint (PDF)
Final Version (PDF)
OpenCL Shader
GLSL Shader
Abstract
BibTex

Libraries using this code:

  • Andrew Straw (CalTech) integrated this shader into the Motmot camera utilities.
  • The G3D Engine uses this shader for demosaicing.

Abstract

This paper describes a series of optimizations for implementing the high-quality Malvar-He-Cutler Bayer demosaicing filter on a GPU in OpenGL. Applying this filter is the first step in most video processing pipelines, but is generally considered too slow for real-time on a CPU. The optimized implementation contains 66% fewer ALU operations than a direct GPU implementation and can filter 40 simultaneous HD 1080p video streams at 30 fps (2728 Mpix/s) on current hardware. It is 2-3 times faster than a straightforward GPU implementation of the same algorithm on many GPUs. Most of the optimizations are applicable to other kinds of processors that support SIMD instructions, like CPUs and DSPs.

BibTex


@article{mcguire09bayer,
 author = {Morgan Mc{G}uire},
 title = {Efficient, High-Quality Bayer Demosaic Filtering on GPUs},
 journal = {Submitted to Journal of Graphics Tools},
 year = {2009},
 publisher = {AK Peters},
 address = {Wellesley, MA}
}

Pixel Shader


/** Monochrome RGBA or GL_LUMINANCE Bayer encoded texture.*/
uniform sampler2D       source;

varying vec4            center;
varying vec4            yCoord;
varying vec4            xCoord;

void main(void) {
    
    #define fetch(x, y) texture2D(source, vec2(x, y)).r

    float C = texture2D(source, center.xy).r; // ( 0, 0)
    const vec4 kC = vec4( 4.0,  6.0,  5.0,  5.0) / 8.0;

    // Determine which of four types of pixels we are on.
    vec2 alternate = mod(floor(center.zw), 2.0);

    vec4 Dvec = vec4(
        fetch(xCoord[1], yCoord[1]),  // (-1,-1)
        fetch(xCoord[1], yCoord[2]),  // (-1, 1)
        fetch(xCoord[2], yCoord[1]),  // ( 1,-1)
        fetch(xCoord[2], yCoord[2])); // ( 1, 1)       

    vec4 PATTERN = (kC.xyz * C).xyzz;

    // Can also be a dot product with (1,1,1,1) on hardware where that is
    // specially optimized.
    // Equivalent to: D = Dvec[0] + Dvec[1] + Dvec[2] + Dvec[3];
    Dvec.xy += Dvec.zw;
    Dvec.x  += Dvec.y;
        
    vec4 value = vec4(  
        fetch(center.x, yCoord[0]),   // ( 0,-2)
        fetch(center.x, yCoord[1]),   // ( 0,-1)
        fetch(xCoord[0], center.y),   // (-1, 0)
        fetch(xCoord[1], center.y));  // (-2, 0)
             
    vec4 temp = vec4(
        fetch(center.x, yCoord[3]),   // ( 0, 2)
        fetch(center.x, yCoord[2]),   // ( 0, 1)
        fetch(xCoord[3], center.y),   // ( 2, 0)
        fetch(xCoord[2], center.y));  // ( 1, 0)

    // Even the simplest compilers should be able to constant-fold these to avoid the division.
    // Note that on scalar processors these constants force computation of some identical products twice.
    const vec4 kA = vec4(-1.0, -1.5,  0.5, -1.0) / 8.0;
    const vec4 kB = vec4( 2.0,  0.0,  0.0,  4.0) / 8.0;
    const vec4 kD = vec4( 0.0,  2.0, -1.0, -1.0) / 8.0;        
            
    // Conserve constant registers and take advantage of free swizzle on load
    #define kE (kA.xywz)
    #define kF (kB.xywz)
    
    value += temp;
    
    // There are five filter patterns (identity, cross, checker,
    // theta, phi).  Precompute the terms from all of them and then
    // use swizzles to assign to color channels. 
    //
    // Channel   Matches
    //   x       cross   (e.g., EE G)
    //   y       checker (e.g., EE B)
    //   z       theta   (e.g., EO R)
    //   w       phi     (e.g., EO R)
    
    #define A (value[0])
    #define B (value[1])
    #define D (Dvec.x)
    #define E (value[2])
    #define F (value[3])
    
    // Avoid zero elements. On a scalar processor this saves two MADDs and it has no
    // effect on a vector processor.
    PATTERN.yzw += (kD.yz * D).xyy;

    PATTERN += (kA.xyz * A).xyzx + (kE.xyw * E).xyxz;
    PATTERN.xw  += kB.xw * B;
    PATTERN.xz  += kF.xz * F;
    
    
    gl_FragColor.rgb = (alternate.y == 0.0) ?
        ((alternate.x == 0.0) ?
            vec3(C, PATTERN.xy) :
            vec3(PATTERN.z, C, PATTERN.w)) :
        ((alternate.x == 0.0) ?
            vec3(PATTERN.w, C, PATTERN.z) :
            vec3(PATTERN.yx, C));
           
}

Vertex Shader


/** (w,h,1/w,1/h) */
uniform vec4            sourceSize;

/** Pixel position of the first red pixel in the Bayer pattern.  [{0,1}, {0, 1}]*/
uniform vec2            firstRed;

/** .xy = Pixel being sampled in the fragment shader on the range [0, 1]
    .zw = ...on the range [0, sourceSize], offset by firstRed */
varying vec4            center;

/** center.x + (-2/w, -1/w, 1/w, 2/w); These are the x-positions of the adjacent pixels.*/
varying vec4            xCoord;

/** center.y + (-2/h, -1/h, 1/h, 2/h); These are the y-positions of the adjacent pixels.*/
varying vec4            yCoord;

void main(void) {

    center.xy = gl_MultiTexCoord0.xy;
    center.zw = gl_MultiTexCoord0.xy * sourceSize.xy + firstRed;
    
    vec2 invSize = sourceSize.zw;
    xCoord = center.x + vec4(-2.0 * invSize.x, -invSize.x, invSize.x, 2.0 * invSize.x);
    yCoord = center.y + vec4(-2.0 * invSize.y, -invSize.y, invSize.y, 2.0 * invSize.y);
    
    gl_Position = gl_ModelViewProjectionMatrix * gl_Vertex;
}