CS:APP Performance Lab
实验需求
在这个lab中,我们将考虑两种图像处理操作:Rotate:逆时针旋转图像90◦,和 Smooth:”平滑”或”模糊”图像。
旋转操作可以很简单地实现为以下两个矩阵操作的组合:
- 转置:对于每个(i,j)对,Mi,j 和 Mj,i被交换。
- 交换行:第 i 行与第 N−1−i 行交换
而平滑操作则是将每一个像素点的 RGB 值设置为以它为中心附近最多九块的平均值。
lab中会给出naive_rotate、naive_smooth作为基准代码,而我们的工作就是优化naive版本的代码。
Rotate
需求
在能够完成其原本功能的情况下利用各种优化方式去优化下述代码:
void naive_rotate(int dim, pixel *src, pixel *dst) {
int i, j;
for(i=0; i < dim; i++)
for(j=0; j < dim; j++)
dst[RIDX(dim-1-j,i,dim)] = src[RIDX(i,j,dim)];
return;
}
The solution
我们考虑利用处理器微体系结构的优化,这里使用到的技巧为循环展开,它的具体做法是通过增加每次迭代计算元素的数量,减少循环迭代次数。
循环展开能从三个方面优化程序性能:
- 它减少了不直接有助于程序结果的操作数的数量,例如循环索引的计算。
- 循环迭代次数减少使得分支预测错误减少。
- 每次迭代计算元素的数量增加,为并行运算提供了机会。多个元素,
而刚好我们处理元素时元素与元素之间不存在数据依赖,此时就可以利用流水线实现指令集并行。
展开时优先选择最外层循环进行展开,我们对最外层循环进行16*16展开。
优化代码如下:
void rotate(int dim, pixel *src, pixel *dst) {
int i, j, k, limit = dim - 15;
for (i = 0; i < limit; i+=16)//16*16循环展开
for (j = 0; j < dim; j++)
for(k = i; k < i + 16; k++)
dst[RIDX(dim-1-j, k, dim)] = src[RIDX(k, j, dim)];
for(; i < dim; i++)
for (j = 0; j < dim; j++)
dst[RIDX(dim-1-j, i, dim)] = src[RIDX(i, j, dim)];
}
经测试可得以下结果,优化后代码的Speedup是naive_rotate的两倍。
Rotate: Version = naive_rotate: Naive baseline implementation:
Dim 64 128 256 512 1024 Mean
Your CPEs 1.7 2.3 4.7 8.6 7.5
Baseline CPEs 14.7 40.1 46.4 65.9 94.5
Speedup 8.8 17.4 10.0 7.7 12.6 10.8
Rotate: Version = rotate: Current working version:
Dim 64 128 256 512 1024 Mean
Your CPEs 1.5 1.6 1.8 1.9 2.7
Baseline CPEs 14.7 40.1 46.4 65.9 94.5
Speedup 9.7 25.4 25.7 35.2 35.5 24.0
Smooth
需求
在能够完成其原本功能的情况下利用各种优化方式去优化下述代码:
typedef struct {
int red;
int green;
int blue;
int num;
} pixel_sum
static void accumulate_sum(pixel_sum *sum, pixel p)
{
sum->red += (int) p.red;
sum->green += (int) p.green;
sum->blue += (int) p.blue;
sum->num++;
return;
}
static void assign_sum_to_pixel(pixel *current_pixel, pixel_sum sum)
{
current_pixel->red = (unsigned short) (sum.red/sum.num);
current_pixel->green = (unsigned short) (sum.green/sum.num);
current_pixel->blue = (unsigned short) (sum.blue/sum.num);
return;
}
static pixel avg(int dim, int i, int j, pixel *src)
{
int ii, jj;
pixel_sum sum;
pixel current_pixel;
initialize_pixel_sum(&sum);
for(ii = max(i-1, 0); ii <= min(i+1, dim-1); ii++)
for(jj = max(j-1, 0); jj <= min(j+1, dim-1); jj++)
accumulate_sum(&sum, src[RIDX(ii, jj, dim)]);
assign_sum_to_pixel(¤t_pixel, sum);
return current_pixel;
}
void naive_smooth(int dim, pixel *src, pixel *dst)
{
int i, j;
for (i = 0; i < dim; i++)
for (j = 0; j < dim; j++)
dst[RIDX(i, j, dim)] = avg(dim, i, j, src);
}
The solution
通过观察可以发现上述代码有大量的过程调用,故我们主要的优化方向就是减少过程调用,根据平滑操作的定义,我们分别处理四个角、四条边、图像内部像素点,图像内部像素点数量最多,关键就是要减少处理图像内部像素点的函数调用。
优化代码如下:
pixel get_vertical(pixel *src, int dim, int idx){
pixel dst;
dst.red = (src[idx].red + src[idx+1].red + src[idx-dim].red + src[idx-dim+1].red + src[idx+dim].red + src[idx+dim+1].red) / 6;
dst.blue = (src[idx].blue + src[idx+1].blue + src[idx-dim].blue + src[idx-dim+1].blue + src[idx+dim].blue + src[idx+dim+1].blue) / 6;
dst.green = (src[idx].green + src[idx+1].green + src[idx-dim].green + src[idx-dim+1].green + src[idx+dim].green + src[idx+dim+1].green) / 6;
return dst;
}
pixel get_across(pixel *src, int dim, int idx){
pixel dst;
dst.red = (src[idx].red + src[idx+1].red + src[idx-1].red + src[idx+dim-1].red + src[idx+dim].red + src[idx+dim+1].red) / 6;
dst.blue = (src[idx].blue + src[idx+1].blue + src[idx-1].blue + src[idx+dim-1].blue + src[idx+dim].blue + src[idx+dim+1].blue) / 6;
dst.green = (src[idx].green + src[idx+1].green + src[idx-1].green + src[idx+dim-1].green + src[idx+dim].green + src[idx+dim+1].green) / 6;
return dst;
}
void smooth(int dim, pixel *src, pixel *dst)
{
dst[0].red = (src[0].red + src[1].red + src[dim + 1].red + src[dim].red) / 4;//左上角
dst[0].blue = (src[0].blue + src[1].blue + src[dim + 1].blue + src[dim].blue) / 4;
dst[0].green = (src[0].green + src[1].green + src[dim + 1].green + src[dim].green) / 4;
dst[dim * (dim-1)].red = (src[dim * (dim-1)].red + src[dim * (dim-1) + 1].red + src[dim * (dim-2)].red + src[dim * (dim-2) + 1].red) / 4;//左下角
dst[dim * (dim-1)].blue =(src[dim * (dim-1)].blue + src[dim * (dim-1) + 1].blue + src[dim * (dim-2)].blue + src[dim * (dim-2) + 1].blue) / 4;
dst[dim * (dim-1)].green =(src[dim * (dim-1)].green + src[dim * (dim-1) + 1].green + src[dim * (dim-2)].green + src[dim * (dim-2) + 1].green) / 4;
dst[dim -1].red = (src[dim - 1].red + src[dim - 2].red + src[dim + dim -1].red + src[dim + dim -2].red) / 4;//右上角
dst[dim -1].blue = (src[dim - 1].blue + src[dim - 2].blue + src[dim + dim -1].blue + src[dim + dim -2].blue) / 4;
dst[dim -1].green = (src[dim - 1].green + src[dim - 2].green + src[dim + dim -1].green + src[dim + dim -2].green) / 4;
dst[(dim *dim -1)].red = (src[dim *dim -1].red + src[dim * dim -2].red + src[dim * (dim-1) -1].red + src[dim * (dim-1) - 2].red) / 4;//右下角
dst[(dim *dim -1)].blue = (src[dim *dim -1].blue + src[dim * dim -2].blue + src[dim * (dim-1) -1].blue + src[dim * (dim-1) - 2].blue) / 4;
dst[(dim *dim -1)].green = (src[dim *dim -1].green + src[dim * dim -2].green + src[dim * (dim-1) -1].green + src[dim * (dim-1) - 2].green) / 4;
int i, j;
int limit = dim - 1;
for (i = 1; i < limit; i++){//平滑最左右两列
dst[dim * i] = get_vertical(src, dim, dim * i);
dst[dim * (i+1) -1] = get_vertical(src, dim, dim * (i+1) -2);
}
for (j = 1; j < limit; j++){//平滑最上下两行
dst[j] = get_across(src, dim, j);
dst[dim * (dim-1) + j] = get_across(src, dim, dim * (dim-2) + j);
}
for(i = 1; i < limit; i++){//平滑内部
int t = i * dim;
for(j = 1; j < limit; j++){
dst[t+j].red = (src[t+j].red + src[t+j+1].red + src[t+j-1].red + src[t+j+dim].red + src[t+j+dim+1].red + src[t+j+dim-1].red + src[t+j-dim].red + src[t+j-dim+1].red + src[t+j-dim-1].red) / 9;
dst[t+j].blue = (src[t+j].blue + src[t+j+1].blue + src[t+j-1].blue + src[t+j+dim].blue + src[t+j+dim+1].blue + src[t+j+dim-1].blue + src[t+j-dim].blue + src[t+j-dim+1].blue + src[t+j-dim-1].blue) / 9;
dst[t+j].green = (src[t+j].green + src[t+j+1].green + src[t+j-1].green + src[t+j+dim].green + src[t+j+dim+1].green + src[t+j+dim-1].green + src[t+j-dim].green + src[t+j-dim+1].green + src[t+j-dim-1].green) / 9;
}
}
}
经测试可得以下结果,优化后代码的Speedup是naive_smooth的三倍。
Smooth: Version = smooth: Current working version:
Dim 32 64 128 256 512 Mean
Your CPEs 12.4 11.8 11.4 11.2 11.4
Baseline CPEs 695.0 698.0 702.0 717.0 722.0
Speedup 56.0 59.2 61.5 64.2 63.6 60.8
Smooth: Version = naive_smooth: Naive baseline implementation:
Dim 32 64 128 256 512 Mean
Your CPEs 36.6 36.7 35.9 58.3 36.2
Baseline CPEs 695.0 698.0 702.0 717.0 722.0
Speedup 19.0 19.0 19.6 12.3 19.9 17.7