|
Efficient, high-quality Bayer demosaic filtering on GPUs
Abstract
This paper describes a series of optimizations for implementing the
high-quality Malvar-He-Cutler Bayer demosaicing filter on a GPU in
OpenGL. Applying this filter is the first step in most video
processing pipelines, but is generally considered too slow for
real-time on a CPU. The optimized implementation contains 66% fewer
ALU operations than a direct GPU implementation and can filter 40
simultaneous HD 1080p video streams at 30 fps (2728 Mpix/s) on current
hardware. It is 2-3 times faster than a straightforward GPU
implementation of the same algorithm on many GPUs. Most of the
optimizations are applicable to other kinds of processors that support
SIMD instructions, like CPUs and DSPs.
BibTex
@article{mcguire09bayer,
author = {Morgan Mc{G}uire},
title = {Efficient, High-Quality Bayer Demosaic Filtering on GPUs},
journal = {Submitted to Journal of Graphics Tools},
year = {2009},
publisher = {AK Peters},
address = {Wellesley, MA}
}
Pixel Shader
/** Monochrome RGBA or GL_LUMINANCE Bayer encoded texture.*/
uniform sampler2D source;
varying vec4 center;
varying vec4 yCoord;
varying vec4 xCoord;
void main(void) {
#define fetch(x, y) texture2D(source, vec2(x, y)).r
float C = texture2D(source, center.xy).r; // ( 0, 0)
const vec4 kC = vec4( 4.0, 6.0, 5.0, 5.0) / 8.0;
// Determine which of four types of pixels we are on.
vec2 alternate = mod(floor(center.zw), 2.0);
vec4 Dvec = vec4(
fetch(xCoord[1], yCoord[1]), // (-1,-1)
fetch(xCoord[1], yCoord[2]), // (-1, 1)
fetch(xCoord[2], yCoord[1]), // ( 1,-1)
fetch(xCoord[2], yCoord[2])); // ( 1, 1)
vec4 PATTERN = (kC.xyz * C).xyzz;
// Can also be a dot product with (1,1,1,1) on hardware where that is
// specially optimized.
// Equivalent to: D = Dvec[0] + Dvec[1] + Dvec[2] + Dvec[3];
Dvec.xy += Dvec.zw;
Dvec.x += Dvec.y;
vec4 value = vec4(
fetch(center.x, yCoord[0]), // ( 0,-2)
fetch(center.x, yCoord[1]), // ( 0,-1)
fetch(xCoord[0], center.y), // (-1, 0)
fetch(xCoord[1], center.y)); // (-2, 0)
vec4 temp = vec4(
fetch(center.x, yCoord[3]), // ( 0, 2)
fetch(center.x, yCoord[2]), // ( 0, 1)
fetch(xCoord[3], center.y), // ( 2, 0)
fetch(xCoord[2], center.y)); // ( 1, 0)
// Even the simplest compilers should be able to constant-fold these to avoid the division.
// Note that on scalar processors these constants force computation of some identical products twice.
const vec4 kA = vec4(-1.0, -1.5, 0.5, -1.0) / 8.0;
const vec4 kB = vec4( 2.0, 0.0, 0.0, 4.0) / 8.0;
const vec4 kD = vec4( 0.0, 2.0, -1.0, -1.0) / 8.0;
// Conserve constant registers and take advantage of free swizzle on load
#define kE (kA.xywz)
#define kF (kB.xywz)
value += temp;
// There are five filter patterns (identity, cross, checker,
// theta, phi). Precompute the terms from all of them and then
// use swizzles to assign to color channels.
//
// Channel Matches
// x cross (e.g., EE G)
// y checker (e.g., EE B)
// z theta (e.g., EO R)
// w phi (e.g., EO R)
#define A (value[0])
#define B (value[1])
#define D (Dvec.x)
#define E (value[2])
#define F (value[3])
// Avoid zero elements. On a scalar processor this saves two MADDs and it has no
// effect on a vector processor.
PATTERN.yzw += (kD.yz * D).xyy;
PATTERN += (kA.xyz * A).xyzx + (kE.xyw * E).xyxz;
PATTERN.xw += kB.xw * B;
PATTERN.xz += kF.xz * F;
gl_FragColor.rgb = (alternate.y == 0.0) ?
((alternate.x == 0.0) ?
vec3(C, PATTERN.xy) :
vec3(PATTERN.z, C, PATTERN.w)) :
((alternate.x == 0.0) ?
vec3(PATTERN.w, C, PATTERN.z) :
vec3(PATTERN.yx, C));
}
Vertex Shader
/** (w,h,1/w,1/h) */
uniform vec4 sourceSize;
/** Pixel position of the first red pixel in the Bayer pattern. [{0,1}, {0, 1}]*/
uniform vec2 firstRed;
/** .xy = Pixel being sampled in the fragment shader on the range [0, 1]
.zw = ...on the range [0, sourceSize], offset by firstRed */
varying vec4 center;
/** center.x + (-2/w, -1/w, 1/w, 2/w); These are the x-positions of the adjacent pixels.*/
varying vec4 xCoord;
/** center.y + (-2/h, -1/h, 1/h, 2/h); These are the y-positions of the adjacent pixels.*/
varying vec4 yCoord;
void main(void) {
center.xy = gl_MultiTexCoord0.xy;
center.zw = gl_MultiTexCoord0.xy * sourceSize.xy + firstRed;
vec2 invSize = sourceSize.zw;
xCoord = center.x + vec4(-2.0 * invSize.x, -invSize.x, invSize.x, 2.0 * invSize.x);
yCoord = center.y + vec4(-2.0 * invSize.y, -invSize.y, invSize.y, 2.0 * invSize.y);
gl_Position = gl_ModelViewProjectionMatrix * gl_Vertex;
}
|