Tutorial 2. Basic Kernel Programming

This is the kernel program for linear filtering. This example shows how to obtain thread IDs when using the media-walker, and how to use the CM matrix type and its select operation.

#include <cm/cm.h>

// Linear (box) filter kernel: every output value is the mean of the 3x3
// pixel neighbourhood around it, computed separately for each colour channel.
//
//     out(x,y) = [ I(x-1,y-1) + I(x,y-1) + I(x+1,y-1) +
//                  I(x-1,y  ) + I(x,y  ) + I(x+1,y  ) +
//                  I(x-1,y+1) + I(x,y+1) + I(x+1,y+1) ] / 9
//
// Work split: one thread produces one 6x8 pixel block. The data is R8G8B8
// (3 bytes per pixel), so that block is a 6x24 matrix of uchar elements.
//
// _GENX_MAIN_ marks this function as a kernel entry point.
//   ibuf - input surface
//   obuf - output surface
extern "C" _GENX_MAIN_ void
linear(SurfaceIndex ibuf, SurfaceIndex obuf)
{
    // With media-walker the thread origin comes from these intrinsics,
    // so no per-thread kernel arguments are needed.
    const uint block_x = get_thread_origin_x();
    const uint block_y = get_thread_origin_y();

    // Surface coordinates of this thread's block: 24 bytes across and
    // 6 rows down per block. The same origin is used for read and write.
    const uint x_off = block_x * 24;
    const uint y_off = block_y * 6;

    // Source tile: 6 output rows plus 2 neighbour rows, 24 output bytes plus
    // 6 neighbour bytes. 8x30 would be enough for the arithmetic, but a
    // block read only transfers multiples of dwords, hence 8x32.
    matrix<uchar, 8, 32> tile;
    read(ibuf, x_off, y_off, tile);

    // Float accumulator for the nine shifted 6x24 windows.
    // select<6, 1, 24, 1>(row, col): height 6 with vertical stride 1,
    // width 24 with horizontal stride 1, top-left corner at (row, col).
    // Column offsets 0 / 3 / 6 step one whole pixel (3 bytes) at a time, so
    // each byte is combined with the same channel of its neighbours.
    // Every partial sum is an integer no larger than 9 * 255, which float
    // holds exactly, so the order of the additions does not affect the result.
    matrix<float, 6, 24> acc;

    // window rows 0..5 of the tile
    acc  = tile.select<6, 1, 24, 1>(0, 0);
    acc += tile.select<6, 1, 24, 1>(0, 3);
    acc += tile.select<6, 1, 24, 1>(0, 6);

    // window rows 1..6 of the tile
    acc += tile.select<6, 1, 24, 1>(1, 0);
    acc += tile.select<6, 1, 24, 1>(1, 3);
    acc += tile.select<6, 1, 24, 1>(1, 6);

    // window rows 2..7 of the tile
    acc += tile.select<6, 1, 24, 1>(2, 0);
    acc += tile.select<6, 1, 24, 1>(2, 3);
    acc += tile.select<6, 1, 24, 1>(2, 6);

    // Approximate division by 9 with a multiply, which is faster.
    // The assignment converts float to uchar implicitly.
    // NOTE(review): 0.111f is slightly below 1/9, so results may come out
    // marginally low - confirm against the reference output before
    // tightening the constant.
    matrix<uchar, 6, 24> result;
    result = acc * 0.111f;

    // 2D media-block write of the filtered block to the output surface.
    write(obuf, x_off, y_off, result);
}