原创 DSP双线性插值代码优化

2016-3-7 15:27 843 14 14 分类: 消费电子
==== 函数原型 ====
 
<code>
以下是代码原型,不是我写的,只是拿来参考(copy)。
/*
 * Table-driven bilinear remap of a rectangle of the destination image.
 *
 * For every destination pixel (col, row) inside stRect:
 *   - table1, viewed as an array of 32-bit ints with dstWidth entries per row,
 *     supplies the source coordinate: high 16 bits = source row, low 16 bits =
 *     source column.
 *   - table2, viewed as an array of 32-bit ints with 2 * dstWidth entries per
 *     row, supplies four 16-bit weights packed into two consecutive ints.
 *   - the 2x2 source neighbourhood starting at that coordinate is blended with
 *     those weights and the sum is shifted right by BITOFF.
 *
 * The source row stride is Width * nchanner bytes, but the horizontal
 * neighbour is always read 3 bytes further on and the destination is always
 * written as 3 bytes per pixel with a row stride of dstWidth * 3.
 * NOTE(review): this only lines up when nchanner == 3 -- confirm with callers.
 *
 * Height and dstHeight are not used. No bounds checking is performed on the
 * table contents or on stRect. Always returns 1.
 */
static LA_bool xxx(unsigned char *src, int Width, int Height, short *table1, short *table2,
    int dstWidth, int dstHeight, int nchanner, unsigned char *dst, Rect_S stRect)
{
    const int srcStride = Width * nchanner;
    const int weightStride = dstWidth * 2;
    const int *coordLut = (int *)table1;
    const int *weightLut = (int *)table2;
    int row;

    for (row = stRect.top; row < stRect.bottom; ++row)
    {
        const int weightRow = row * weightStride;
        unsigned char *out = dst + row * dstWidth * 3 + stRect.left * 3;
        int col;

        for (col = stRect.left; col < stRect.right; ++col)
        {
            /* Packed lookups for this destination pixel. */
            const int packedXY = coordLut[row * dstWidth + col];
            const int weightIdx = weightRow + (col << 1);
            const int packedW0 = weightLut[weightIdx];
            const int packedW1 = weightLut[1 + weightIdx];

            /* Unpack source coordinate and the four 16-bit weights. */
            const int srcY = (packedXY >> 16) & 0x0000ffff;
            const int srcX = packedXY & 0x0000ffff;
            const short w00 = packedW0 & 0x0000ffff;
            const short w01 = (packedW0 >> 16) & 0x0000ffff;
            const short w10 = packedW1 & 0x0000ffff;
            const short w11 = (packedW1 >> 16) & 0x0000ffff;

            /* Upper and lower source rows of the 2x2 neighbourhood. */
            const int upperOff = srcY * srcStride + srcX * nchanner;
            const unsigned char *upper = src + upperOff;
            const unsigned char *lower = src + (upperOff + srcStride);

            /* Read all twelve samples before writing anything to dst. */
            const unsigned char r00 = upper[0];
            const unsigned char g00 = upper[1];
            const unsigned char b00 = upper[2];
            const unsigned char r01 = upper[3];
            const unsigned char g01 = upper[4];
            const unsigned char b01 = upper[5];
            const unsigned char r10 = lower[0];
            const unsigned char g10 = lower[1];
            const unsigned char b10 = lower[2];
            const unsigned char r11 = lower[3];
            const unsigned char g11 = lower[4];
            const unsigned char b11 = lower[5];

            /* Weight pairing is kept as-is: w01 scales the lower-left sample,
             * w10 the upper-right one. */
            out[0] = (unsigned char)((r00 * w00 + r10 * w01 + r01 * w10 + r11 * w11) >> BITOFF);
            out[1] = (unsigned char)((g00 * w00 + g10 * w01 + g01 * w10 + g11 * w11) >> BITOFF);
            out[2] = (unsigned char)((b00 * w00 + b10 * w01 + b01 * w10 + b11 * w11) >> BITOFF);
            out += 3;
        }
    }
    return 1;
}
</code>
 
==== 优化后代码 ====
 
<code>
/* Source image width; multiplied by PixelSize in Load_Dot16_YUV422 to step
 * from one source row to the next. */
static int ImgIn_w = 800;
/* Source image height; not referenced by the code visible in this listing. */
static int ImgIn_h = 600;
/* Width and height of the output image and of the lookup tables; used in
 * Load_Dot16_YUV422 to compute per-layer, per-row and per-pixel offsets. */
static int Img_w = 600;
static int Img_h = 800;
/* Size of one source pixel in address units (presumably bytes -- confirm). */
static int PixelSize = 2;
/*
 * Load_Dot16_YUV422 - intrinsic-based, hand software-pipelined version of the
 * table-driven remap; each inner-loop iteration handles one pair of output
 * pixels and emits one 32-bit store.
 *
 * Parameters:
 *   stRect_left/right/top/bottom  rectangle to process, in output pixel
 *                                 coordinates (rows top..bottom-1, columns
 *                                 starting at left). The column count is
 *                                 halved with >>1, so an odd trailing column
 *                                 is not processed.
 *   ImageIndex                    selects the table layer: the XY table is
 *                                 advanced by Img_h*Img_w*4 and the
 *                                 coefficient table by Img_h*Img_w*8 per index.
 *   ImageOutIndex                 selects the output layer: the destination is
 *                                 advanced by Img_h*Img_w*2 per index.
 *
 * Inputs taken from names defined outside this function: pImgS1In (source),
 * pImgS1Out (destination), pParam_Loadx4XY (offset table), pParam_Dot16Cof
 * (coefficient table), BITOFF (final right shift), pDtmp (store pointer).
 *
 * NOTE(review): the _loll/_hill/_mem8/_dpack2/_dpackh4/_itoll/_dshru/_dshr/
 * _ddotpsu4h/_dpackl4/_packl4/_llto128 calls are presumably TI C6000 compiler
 * intrinsics; their exact lane semantics are not visible here -- confirm
 * against the compiler manual before changing any of the packing steps.
 *
 * NOTE(review): pDtmp is assigned and dereferenced below but is not declared
 * in this function; it must be a file-scope pointer to a 32-bit type --
 * confirm.
 *
 * NOTE(review): the pipeline reads ahead. For every row it fetches table
 * entries beyond the last pixel pair that is stored, and dereferences source
 * addresses taken from those entries, so the tables and the source buffer
 * need valid data/padding past the rectangle -- confirm.
 */
static void Load_Dot16_YUV422(int stRect_left, int stRect_right, int stRect_top, int stRect_bottom, int ImageIndex, int ImageOutIndex)
{
    /* Base addresses held as plain integers so they can be advanced with
     * integer arithmetic before being turned into pointers. */
    unsigned int src0 = pImgS1In;
    unsigned int dst0 = pImgS1Out;
    unsigned int xy_tab0 = pParam_Loadx4XY;
    unsigned int  cof_tab0 = pParam_Dot16Cof;
 
    int Index_i, Index_j;
    /* src / src1: base of the source image and of the row one stride below. */
    unsigned char* src;
    unsigned char * src1;
    register tu32 XY_tab_Addr;
    /* Walks the offset table 64 bits at a time (two 32-bit source offsets). */
    unsigned long long *pXY_tab;
 
    int i_LoopNum;
    int j_offset;
 
    /* Prefetched state for the two pixels of the current pair:
     * offset_load_N      source offset of pixel N
     * offset_load_N_is   bit 1 of that offset (offset & 2)
     * data_load_A_N/B_N  8 source bytes from row src / row src1 at that offset */
    unsigned long long offset_12;
    int offset_load_1;
    int offset_load_1_is;
    unsigned long long data_load_A_1;
    unsigned long long data_load_B_1;
 
    int offset_load_2;
    int offset_load_2_is;
    unsigned long long data_load_A_2;
    unsigned long long data_load_B_2;
 
    /* pCof_tab / pCof_tab1 walk the even and odd 64-bit coefficient entries. */
    unsigned long long *pCof_tab;
    unsigned long long *pCof_tab1;
    unsigned long long Cof_ABCD_1;
    unsigned long long Cof_ABCD_2;
 
    /* Mask that keeps the low byte of every 16-bit lane. */
    unsigned long long Data64_AND_00FF = 0x00FF00FF00FF00FF;
 
    unsigned int Data32_7654_A_1;
    unsigned int Data32_7654_B_1;
    unsigned int Data32_3210_A_2;
    unsigned int Data32_3210_B_2;
 
    unsigned long long Data64_DP2_76765454_1;
    unsigned long long Data64_DP2_32321010_2;
 
    unsigned long long Data64_DPH4_75753131_1;
    unsigned long long Data64_DPH4_75753131_2;
 
    unsigned long long Data64_MV55_75753131_1;
    unsigned long long Data64_MV33_75753131_2;
 
    unsigned long long Data64_SHFU_75753131_1;
    unsigned long long Data64_SHFU_75753131_2;
 
    unsigned int Data32_7575_1;
    unsigned int Data32_3131_1;
    unsigned int Data32_7575_2;
    unsigned int Data32_3131_2;
    int Shift_Num_1;
    int Shift_Num_2;
 
    /* Naming of the 2x2 source neighbourhood used in the identifiers below
     * (presumably A B on the upper row, C D on the lower row -- confirm): */
    //AB
    //CD
    unsigned long long Data64_Y0_0B0D0A0C;
    unsigned long long Data64_Y1_0B0D0A0C;
    unsigned long long Data64_U0_0B0D0A0C;
    unsigned long long Data64_V1_0B0D0A0C;
 
    __x128_t D128_Y1Y0_0B0D0A0C;
    __x128_t D128_V1U0_0B0D0A0C;
    __x128_t D128_C1C0_0B0D0A0C;
    /* Dot-product results; written at the end of one iteration and consumed
     * at the start of the shift/pack stage of the next one. */
    unsigned long long D64_Y1Y0_Dot16;
    unsigned long long D64_V1U0_Dot16;
    unsigned long long D64_Y1Y0_SHRU;
    unsigned long long D64_V1U0_SHRU;
    unsigned long long D64_V1_Y1_U0_Y0;
    unsigned int D32_V1_Y1;
    unsigned int D32_U0_Y0;
    unsigned int D32_V1Y1U0Y0;
 
    /* NOTE(review): integer-to-pointer assignments without a cast. */
    src = src0;
    src1 = src0 + ImgIn_w * PixelSize;
 
    // Layer offset: skip whole table/output layers selected by the indices.
    xy_tab0+=Img_h*Img_w*4*ImageIndex;
    cof_tab0+=Img_h*Img_w*(1<<3)*ImageIndex;
    dst0 +=Img_h*Img_w*2*ImageOutIndex;
 
    // Window offset: skip the rows above the rectangle
    // (4 / 8 / 2 address units per pixel for XY table / coefficients / output).
    {
        j_offset = stRect_top*Img_w;
        xy_tab0 = xy_tab0 + (j_offset<<2);
        cof_tab0= cof_tab0 + (j_offset<<3);
        dst0 = dst0 + (j_offset<<1);
    }
    // Line offset: skip the columns left of the rectangle.
    {
        j_offset = stRect_left;
        xy_tab0 = xy_tab0 + (j_offset<<2);
        cof_tab0= cof_tab0 + (j_offset<<3);
        dst0 = dst0 + (j_offset<<1);
    }
 
    // Row loop: number of pixel pairs per row (odd trailing pixel is dropped).
    i_LoopNum = stRect_right-stRect_left;
    i_LoopNum = i_LoopNum>>1;
 
    for (Index_j = stRect_top; Index_j <stRect_bottom; Index_j++)
    {
        // pre_init: set this row's cursors, then advance the bases one full
        // image row (Img_w pixels) for the next pass.
        XY_tab_Addr = xy_tab0;
        pXY_tab = XY_tab_Addr;
        xy_tab0+=(Img_w<<2);
        pCof_tab = cof_tab0;
        cof_tab0+=(Img_w<<3);
        pDtmp = dst0;
        dst0+=(Img_w<<1);
 
        // pre_Loop: pipeline prologue. Fetches the offsets and source data of
        // the first pixel pair (steps tagged C1) and the table entry of the
        // second pair (C2) so the loop body always works on data loaded earlier.
        {
            pCof_tab1 = &pCof_tab[1];
 
            offset_12 = *pXY_tab++;//C1
            offset_load_1 = _loll(offset_12);//C1
            offset_load_2 = _hill(offset_12);//C1
            data_load_A_1 =  _mem8(src+offset_load_1);//C1
            data_load_B_1 =  _mem8(src1+offset_load_1);//C1
            data_load_A_2 =  _mem8(src+offset_load_2);//C1
            data_load_B_2 =  _mem8(src1+offset_load_2);//C1
            offset_load_1_is = offset_load_1&2;//C1
            offset_load_2_is = offset_load_2&2;//C1
            offset_12 = *pXY_tab++;//C2
        }
 
        /* Runs i_LoopNum+1 times: the store at the bottom is one iteration
         * behind the dot product, so one extra pass is needed to flush the
         * last pair. */
        for (Index_i =0; Index_i <i_LoopNum+1; Index_i++)
        {
            /* Stage 1: repack the previously loaded bytes. Pixel 1 uses the
             * upper 32 bits of its loads, pixel 2 the lower 32 bits. */
            Data32_7654_A_1 = _hill(data_load_A_1);
            Data32_7654_B_1 = _hill(data_load_B_1);
            Data64_DP2_76765454_1 = _dpack2(Data32_7654_B_1, Data32_7654_A_1);
            Data32_3210_A_2 = _loll(data_load_A_2);
            Data32_3210_B_2 = _loll(data_load_B_2);
            Data64_DP2_32321010_2 = _dpack2(Data32_3210_B_2, Data32_3210_A_2);
 
            Data64_DPH4_75753131_1 = _dpackh4(data_load_B_1, data_load_A_1);    //76543210 76543210>7575 3131
            Data64_DPH4_75753131_2 = _dpackh4(data_load_B_2, data_load_A_2);    //76543210 76543210>7575 3131
 
            /* Pixel 1: when bit 1 of its source offset is clear, duplicate the
             * high word into the low word and do not shift; otherwise keep both
             * words and shift right by 8.
             * NOTE(review): presumably selects the chroma byte matching the
             * pixel's position inside the YUV422 macropixel -- confirm. */
            Data32_7575_1 = _hill(Data64_DPH4_75753131_1);
            Data32_3131_1 = _loll(Data64_DPH4_75753131_1);
            Shift_Num_1 = 8;
            if(offset_load_1_is==0)
            {
                Data32_3131_1 = Data32_7575_1;
                Shift_Num_1 = 0;
            }
            Data64_MV55_75753131_1 = _itoll(Data32_7575_1, Data32_3131_1);
            Data64_SHFU_75753131_1 = _dshru(Data64_MV55_75753131_1, Shift_Num_1);
 
            /* Pixel 2: mirrored selection -- when bit 1 of its source offset is
             * clear, duplicate the low word into the high word and shift right
             * by 8; otherwise keep both words and do not shift. */
            Data32_7575_2 = _hill(Data64_DPH4_75753131_2);
            Data32_3131_2 = _loll(Data64_DPH4_75753131_2);
            Shift_Num_2 = 0;
            if(offset_load_2_is==0)
            {
                Data32_7575_2 = Data32_3131_2;
                Shift_Num_2 = 8;
            }
            Data64_MV33_75753131_2 = _itoll(Data32_7575_2, Data32_3131_2);
            Data64_SHFU_75753131_2 = _dshru(Data64_MV33_75753131_2, Shift_Num_2);
 
            /* Keep only the low byte of each 16-bit lane. */
            Data64_Y0_0B0D0A0C = Data64_DP2_76765454_1 & Data64_AND_00FF;
            Data64_Y1_0B0D0A0C = Data64_DP2_32321010_2 & Data64_AND_00FF;
            Data64_U0_0B0D0A0C = Data64_SHFU_75753131_1 & Data64_AND_00FF;
            Data64_V1_0B0D0A0C = Data64_SHFU_75753131_2 & Data64_AND_00FF;
 
        // Loop optimization begin: prefetch the NEXT pair's source data (C2)
        // and the table entry of the pair after that (C3). This must stay after
        // the packing above, which still needs the current data_load_* values.
        offset_load_1 = _loll(offset_12);//C2
        offset_load_2 = _hill(offset_12);//C2
        data_load_A_1 =  _mem8(src+offset_load_1);//C2
        data_load_B_1 =  _mem8(src1+offset_load_1);//C2
        data_load_A_2 =  _mem8(src+offset_load_2);//C2
        data_load_B_2 =  _mem8(src1+offset_load_2);//C2
        offset_load_1_is = offset_load_1&2;//C2
        offset_load_2_is = offset_load_2&2;//C2
        offset_12 = *pXY_tab++;//C3
        // Loop optimization end
//-----------------
 
            /* Stage 2: one 64-bit coefficient entry per pixel of the pair;
             * both cursors step by 2 entries. */
            Cof_ABCD_1 = *pCof_tab;pCof_tab+=2;
            Cof_ABCD_2 = *pCof_tab1;pCof_tab1+=2;
 
            D128_C1C0_0B0D0A0C = _llto128(Cof_ABCD_2, Cof_ABCD_1);
            D128_Y1Y0_0B0D0A0C = _llto128(Data64_Y1_0B0D0A0C, Data64_Y0_0B0D0A0C);
            D128_V1U0_0B0D0A0C = _llto128(Data64_V1_0B0D0A0C, Data64_U0_0B0D0A0C);
 
                /* Consumes the dot products of the PREVIOUS iteration; they are
                 * overwritten just below. On the first iteration these inputs
                 * are uninitialized and the result is discarded by if(Index_i). */
                D64_Y1Y0_SHRU = _dshr(D64_Y1Y0_Dot16, BITOFF);// dot-product latency: the pipelined output is handled one iteration late
                D64_V1U0_SHRU = _dshr(D64_V1U0_Dot16, BITOFF);
 
            D64_Y1Y0_Dot16 = _ddotpsu4h(D128_Y1Y0_0B0D0A0C, D128_C1C0_0B0D0A0C);
            D64_V1U0_Dot16 = _ddotpsu4h(D128_V1U0_0B0D0A0C, D128_C1C0_0B0D0A0C);
 
                /* Stage 3: pack the previous pair's four results into one
                 * 32-bit word and store it. */
                D64_V1_Y1_U0_Y0 = _dpackl4(D64_V1U0_SHRU, D64_Y1Y0_SHRU);
                D32_V1_Y1 = _hill(D64_V1_Y1_U0_Y0);
                D32_U0_Y0 = _loll(D64_V1_Y1_U0_Y0);
                D32_V1Y1U0Y0 = _packl4(D32_V1_Y1, D32_U0_Y0);
                /* Skip the store on the first pass: no valid result exists yet. */
                if(Index_i)
                {
                    *pDtmp++ = D32_V1Y1U0Y0;
                }
 
//-----------------
 
        }
    }
    return ;
}
</code>
 

文章评论(0条评论)

登录后参与讨论
我要评论
0
14
关闭 站长推荐上一条 /2 下一条