Tutorial 11. Kernel Programming: Register Usage
Here we show three different algorithms for the seemingly simple task of linear filtering. All three algorithms implement the 2-d convolution as two 1-d convolutions, horizontal then vertical. This approach minimizes computation; however, it needs some more storage for intermediate results.
// Change the algorithm from one 2-d convolution to two 1-d convolutions.
// This change saves computation yet requires more registers,
// so it is a trade-off between time and space. On GEN, every EU-thread
// gets 128x32 bytes of register space. As long as kernels can stay within
// this limit, we should strive for computation efficiency.
// 3x3 linear filter over one output tile, done as a horizontal pass
// followed by a vertical pass.
//   ibuf - input surface; an 8x32-byte block is read at byte column
//          thread_x*24, row thread_y*6.
//   obuf - output surface; a 6x24-byte block is written at the same
//          coordinates.
extern "C" _GENX_MAIN_ void
linear1d2(SurfaceIndex ibuf, SurfaceIndex obuf)
{
    // With media-walker the thread origin is obtained from these
    // intrinsics, so no per-thread kernel arguments are needed.
    const uint tile_x = get_thread_origin_x();
    const uint tile_y = get_thread_origin_y();
    const uint x_byte = tile_x * 24;
    const uint y_row  = tile_y * 6;

    // 8 rows x 32 bytes of input: two extra rows and six extra columns
    // are consumed by the filter taps below.
    matrix<uchar, 8, 32> src;
    read(ibuf, x_byte, y_row, src);

    // Horizontal pass: for every input row, add the three taps located
    // 0, 3 and 6 bytes to the right of each output column.
    // NOTE(review): the 3-byte tap spacing presumably matches a
    // 3-byte-per-pixel image layout - confirm against the host code.
    matrix<short, 8, 24> row_sum;
    row_sum = src.select<8,1,24,1>(0,0)
            + src.select<8,1,24,1>(0,3)
            + src.select<8,1,24,1>(0,6);

    // Vertical pass: each output row is the sum of three consecutive
    // rows of the horizontal result.
    matrix<short, 6, 24> box_sum;
    box_sum = row_sum.select<6,1,24,1>(0,0)
            + row_sum.select<6,1,24,1>(1,0)
            + row_sum.select<6,1,24,1>(2,0);

    // Scale the nine-tap sum by 0.111f (roughly 1/9) and narrow to bytes.
    matrix<uchar, 6, 24> dst;
    dst = box_sum * 0.111f;

    write(obuf, x_byte, y_row, dst);
}
// This version also uses two 1-d convolutions to save computation.
// Unlike linear1d2, it uses a sliding-window scheme to minimize
// the storage: 3 rows for both input and intermediate result,
// and one row for output. However, in this way, it loads input one
// row at a time, and stores output one row at a time.
// 3x3 linear filter over one 6-row x 24-byte output tile, keeping only a
// 3-row window of input and of horizontal sums at any time.
//   ibuf - input surface; rows thread_y*6 .. thread_y*6+7 are read,
//          32 bytes wide, starting at byte column thread_x*24.
//   obuf - output surface; six 24-byte rows are written, one per write,
//          starting at byte column thread_x*24, row thread_y*6.
extern "C" _GENX_MAIN_ void
linearslide(SurfaceIndex ibuf, SurfaceIndex obuf)
{
    // With media-walker the thread origin is obtained from these
    // intrinsics, so no per-thread kernel arguments are needed.
    const uint tile_x = get_thread_origin_x();
    const uint tile_y = get_thread_origin_y();
    const uint x_byte = tile_x * 24;
    const uint y_row  = tile_y * 6;

    // Prime the window with input rows 0..2 of the tile.
    matrix<uchar, 3, 32> window;
    read(ibuf, x_byte, y_row, window);

    // Horizontal sums (taps at byte offsets 0, 3 and 6) for those rows.
    matrix<short, 3, 24> hsum;
    hsum  = window.select<3,1,24,1>(0,0) + window.select<3,1,24,1>(0,3);
    hsum += window.select<3,1,24,1>(0,6);

    vector<short, 24> vsum;
    vector<uchar, 24> line;

    // 'next' is the tile-relative input row fetched at the end of each
    // iteration; the output row produced in that iteration is next-3.
#pragma unroll
    for (int next = 3; next < 8; ++next) {
        // Vertical sum of the three rows currently held in the window,
        // scaled by 0.111f (roughly 1/9).
        vsum  = hsum.row(0) + hsum.row(1);
        vsum += hsum.row(2);
        line = vsum * 0.111f;
        write(obuf, x_byte, y_row + (next - 3), line);

        // The window is used as a ring: the slot holding the oldest
        // row (next % 3) is overwritten with the incoming row.
        const int slot = next % 3;
        read(ibuf, x_byte, y_row + next, window.row(slot));

        // Refresh the horizontal sum for the replaced slot only.
        hsum.row(slot) = window.select<1,1,24,1>(slot,0)
                       + window.select<1,1,24,1>(slot,3)
                       + window.select<1,1,24,1>(slot,6);
    }

    // The window now holds input rows 5..7; emit the sixth output row.
    vsum  = hsum.row(0) + hsum.row(1);
    vsum += hsum.row(2);
    line = vsum * 0.111f;
    write(obuf, x_byte, y_row + 5, line);
}
// This is another version that uses a sliding window.
// However, it only minimizes the intermediate result to 3 rows.
// It keeps the entire input block and output block in registers
// in order to utilize the large media-block read and write.
// 3x3 linear filter over one output tile that loads and stores whole
// blocks but keeps only three rows of horizontal sums.
//   ibuf - input surface; an 8x32-byte block is read at byte column
//          thread_x*24, row thread_y*6.
//   obuf - output surface; a 6x24-byte block is written at the same
//          coordinates.
extern "C" _GENX_MAIN_ void
linearslide2(SurfaceIndex ibuf, SurfaceIndex obuf)
{
    // With media-walker the thread origin is obtained from these
    // intrinsics, so no per-thread kernel arguments are needed.
    const uint tile_x = get_thread_origin_x();
    const uint tile_y = get_thread_origin_y();
    const uint x_byte = tile_x * 24;
    const uint y_row  = tile_y * 6;

    // Fetch the full 8x32 input block with a single read.
    matrix<uchar, 8, 32> src;
    read(ibuf, x_byte, y_row, src);

    // Horizontal sums (taps at byte offsets 0, 3 and 6) of rows 0..2.
    matrix<short, 3, 24> hsum;
    hsum  = src.select<3,1,24,1>(0,0) + src.select<3,1,24,1>(0,3);
    hsum += src.select<3,1,24,1>(0,6);

    vector<short, 24> vsum;
    matrix<uchar, 6, 24> dst;

#pragma unroll
    for (int r = 0; r < 5; ++r) {
        // Output row r: vertical sum of the three rows held in hsum,
        // scaled by 0.111f (roughly 1/9).
        vsum  = hsum.row(0) + hsum.row(1);
        vsum += hsum.row(2);
        dst.row(r) = vsum * 0.111f;

        // hsum is used as a ring: slot r % 3 holds the oldest row and
        // is replaced by the horizontal sum of input row r + 3.
        const int slot   = r % 3;
        const int refill = r + 3;
        hsum.row(slot) = src.select<1,1,24,1>(refill,0)
                       + src.select<1,1,24,1>(refill,3)
                       + src.select<1,1,24,1>(refill,6);
    }

    // hsum now covers input rows 5..7; produce the sixth output row.
    vsum  = hsum.row(0) + hsum.row(1);
    vsum += hsum.row(2);
    dst.row(5) = vsum * 0.111f;

    // Store the full 6x24 output block with a single write.
    write(obuf, x_byte, y_row, dst);
}