热度 14
2016-3-7 15:27
867 次阅读|
0 个评论
==== 函数原型 ==== code 代码原型,不是我写的。只是拿来copy。 static LA_bool xxx(unsigned char *src, int Width, int Height, short *table1, short *table2, int dstWidth, int dstHeight, int nchanner, unsigned char *dst, Rect_S stRect) { int sx, sy; int i, j; int stepSrc = Width * nchanner; int stepDstMapxy = dstWidth; int stepDstMapCoef = dstWidth * 2; short cof00, cof01, cof10, cof11; int offset1, offset2; //int r, g, b; int dstoff = 0; int coflinestart; int dstlinestart; int *xy_tab = (int *)table1; int *cof_tab = (int *)table2; int xyval = 0; int xyoff, coff; int cofval1, cofval2; unsigned char r00, g00, b00, r01, g01, b01, r10, g10, b10, r11, g11, b11; char *p1, *p2; int rgb0, rgb1, rgb2, rgb3; unsigned char *pDtmp; for (j = stRect.top; j stRect.bottom; ++j) { coflinestart = j*stepDstMapCoef; dstlinestart = j*dstWidth * 3; pDtmp = dst + dstlinestart + stRect.left * 3; for (i = stRect.left; i stRect.right; ++i) { xyoff = j*stepDstMapxy + i; xyval = *(xy_tab + xyoff); sy = (xyval 16) 0x0000ffff; sx = xyval 0x0000ffff; coff = coflinestart + (i 1); cofval1 = *(cof_tab + coff); cofval2 = *(cof_tab + 1 + coff); cof01 = (cofval1 16) 0x0000ffff; cof00 = cofval1 0x0000ffff; cof11 = (cofval2 16) 0x0000ffff; cof10 = cofval2 0x0000ffff; offset1 = (sy * stepSrc + sx*nchanner); offset2 = offset1 + stepSrc; p1 = (char *)(src + offset1); p2 = (char *)(src + offset2); r00 = *p1++; g00 = *p1++; b00 = *p1++; r01 = *p1++; g01 = *p1++; b01 = *p1++; r10 = *p2++; g10 = *p2++; b10 = *p2++; r11 = *p2++; g11 = *p2++; b11 = *p2++; *(pDtmp++) = (unsigned char)((r00 * cof00 + r10 * cof01 + r01 * cof10 + r11 * cof11) BITOFF); *(pDtmp++) = (unsigned char)((g00 * cof00 + g10 * cof01 + g01 * cof10 + g11 * cof11) BITOFF); *(pDtmp++) = (unsigned char)((b00 * cof00 + b10 * cof01 + b01 * cof10 + b11 * cof11) BITOFF); } } return 1; } /code ==== 优化后代码 ==== code static int ImgIn_w = 800; static int ImgIn_h = 600; static int Img_w = 600; static int Img_h = 800; static int PixelSize = 2; static void Load_Dot16_YUV422(int stRect_left, int stRect_right, int stRect_top, int stRect_bottom, int ImageIndex, int ImageOutIndex) { unsigned int src0 = pImgS1In; unsigned int dst0 = pImgS1Out; unsigned int xy_tab0 = pParam_Loadx4XY; unsigned int cof_tab0 = pParam_Dot16Cof; int Index_i, Index_j; unsigned char* src; unsigned char * src1; register tu32 XY_tab_Addr; unsigned long long *pXY_tab; int i_LoopNum; int j_offset; unsigned long long offset_12; int offset_load_1; int offset_load_1_is; unsigned long long data_load_A_1; unsigned long long data_load_B_1; int offset_load_2; int offset_load_2_is; unsigned long long data_load_A_2; unsigned long long data_load_B_2; unsigned long long *pCof_tab; unsigned long long *pCof_tab1; unsigned long long Cof_ABCD_1; unsigned long long Cof_ABCD_2; unsigned long long Data64_AND_00FF = 0x00FF00FF00FF00FF; unsigned int Data32_7654_A_1; unsigned int Data32_7654_B_1; unsigned int Data32_3210_A_2; unsigned int Data32_3210_B_2; unsigned long long Data64_DP2_76765454_1; unsigned long long Data64_DP2_32321010_2; unsigned long long Data64_DPH4_75753131_1; unsigned long long Data64_DPH4_75753131_2; unsigned long long Data64_MV55_75753131_1; unsigned long long Data64_MV33_75753131_2; unsigned long long Data64_SHFU_75753131_1; unsigned long long Data64_SHFU_75753131_2; unsigned int Data32_7575_1; unsigned int Data32_3131_1; unsigned int Data32_7575_2; unsigned int Data32_3131_2; int Shift_Num_1; int Shift_Num_2; //AB //CD unsigned long long Data64_Y0_0B0D0A0C; unsigned long long Data64_Y1_0B0D0A0C; unsigned long long Data64_U0_0B0D0A0C; unsigned long long Data64_V1_0B0D0A0C; __x128_t D128_Y1Y0_0B0D0A0C; __x128_t D128_V1U0_0B0D0A0C; __x128_t D128_C1C0_0B0D0A0C; unsigned long long D64_Y1Y0_Dot16; unsigned long long D64_V1U0_Dot16; unsigned long long D64_Y1Y0_SHRU; unsigned long long D64_V1U0_SHRU; unsigned long long D64_V1_Y1_U0_Y0; unsigned int D32_V1_Y1; unsigned int D32_U0_Y0; unsigned int D32_V1Y1U0Y0; src = src0; src1 = src0 + ImgIn_w * PixelSize; //图层偏址 xy_tab0+=Img_h*Img_w*4*ImageIndex; cof_tab0+=Img_h*Img_w*(13)*ImageIndex; dst0 +=Img_h*Img_w*2*ImageOutIndex; //windows偏址 { j_offset = stRect_top*Img_w; xy_tab0 = xy_tab0 + (j_offset2); cof_tab0= cof_tab0 + (j_offset3); dst0 = dst0 + (j_offset1); } //line偏址 { j_offset = stRect_left; xy_tab0 = xy_tab0 + (j_offset2); cof_tab0= cof_tab0 + (j_offset3); dst0 = dst0 + (j_offset1); } //行循环 i_LoopNum = stRect_right-stRect_left; i_LoopNum = i_LoopNum1; for (Index_j = stRect_top; Index_j stRect_bottom; Index_j++) { //pre_init XY_tab_Addr = xy_tab0; pXY_tab = XY_tab_Addr; xy_tab0+=(Img_w2); pCof_tab = cof_tab0; cof_tab0+=(Img_w3); pDtmp = dst0; dst0+=(Img_w1); //pre_Loop { pCof_tab1 = pCof_tab ; offset_12 = *pXY_tab++;//C1 offset_load_1 = _loll(offset_12);//C1 offset_load_2 = _hill(offset_12);//C1 data_load_A_1 = _mem8(src+offset_load_1);//C1 data_load_B_1 = _mem8(src1+offset_load_1);//C1 data_load_A_2 = _mem8(src+offset_load_2);//C1 data_load_B_2 = _mem8(src1+offset_load_2);//C1 offset_load_1_is = offset_load_12;//C1 offset_load_2_is = offset_load_22;//C1 offset_12 = *pXY_tab++;//C2 } for (Index_i =0; Index_i i_LoopNum+1; Index_i++) { Data32_7654_A_1 = _hill(data_load_A_1); Data32_7654_B_1 = _hill(data_load_B_1); Data64_DP2_76765454_1 = _dpack2(Data32_7654_B_1, Data32_7654_A_1); Data32_3210_A_2 = _loll(data_load_A_2); Data32_3210_B_2 = _loll(data_load_B_2); Data64_DP2_32321010_2 = _dpack2(Data32_3210_B_2, Data32_3210_A_2); Data64_DPH4_75753131_1 = _dpackh4(data_load_B_1, data_load_A_1); //76543210 765432107575 3131 Data64_DPH4_75753131_2 = _dpackh4(data_load_B_2, data_load_A_2); //76543210 765432107575 3131 Data32_7575_1 = _hill(Data64_DPH4_75753131_1); Data32_3131_1 = _loll(Data64_DPH4_75753131_1); Shift_Num_1 = 8; if(offset_load_1_is==0) { Data32_3131_1 = Data32_7575_1; Shift_Num_1 = 0; } Data64_MV55_75753131_1 = _itoll(Data32_7575_1, Data32_3131_1); Data64_SHFU_75753131_1 = _dshru(Data64_MV55_75753131_1, Shift_Num_1); Data32_7575_2 = _hill(Data64_DPH4_75753131_2); Data32_3131_2 = _loll(Data64_DPH4_75753131_2); Shift_Num_2 = 0; if(offset_load_2_is==0) { Data32_7575_2 = Data32_3131_2; Shift_Num_2 = 8; } Data64_MV33_75753131_2 = _itoll(Data32_7575_2, Data32_3131_2); Data64_SHFU_75753131_2 = _dshru(Data64_MV33_75753131_2, Shift_Num_2); Data64_Y0_0B0D0A0C = Data64_DP2_76765454_1 Data64_AND_00FF; Data64_Y1_0B0D0A0C = Data64_DP2_32321010_2 Data64_AND_00FF; Data64_U0_0B0D0A0C = Data64_SHFU_75753131_1 Data64_AND_00FF; Data64_V1_0B0D0A0C = Data64_SHFU_75753131_2 Data64_AND_00FF; //这里是循环优化begin offset_load_1 = _loll(offset_12);//C2 offset_load_2 = _hill(offset_12);//C2 data_load_A_1 = _mem8(src+offset_load_1);//C2 data_load_B_1 = _mem8(src1+offset_load_1);//C2 data_load_A_2 = _mem8(src+offset_load_2);//C2 data_load_B_2 = _mem8(src1+offset_load_2);//C2 offset_load_1_is = offset_load_12;//C2 offset_load_2_is = offset_load_22;//C2 offset_12 = *pXY_tab++;//C3 //这里是循环优化end //----------------- Cof_ABCD_1 = *pCof_tab;pCof_tab+=2; Cof_ABCD_2 = *pCof_tab1;pCof_tab1+=2; D128_C1C0_0B0D0A0C = _llto128(Cof_ABCD_2, Cof_ABCD_1); D128_Y1Y0_0B0D0A0C = _llto128(Data64_Y1_0B0D0A0C, Data64_Y0_0B0D0A0C); D128_V1U0_0B0D0A0C = _llto128(Data64_V1_0B0D0A0C, Data64_U0_0B0D0A0C); D64_Y1Y0_SHRU = _dshr(D64_Y1Y0_Dot16, BITOFF);//dot延迟,流水线输出超越处理 D64_V1U0_SHRU = _dshr(D64_V1U0_Dot16, BITOFF); D64_Y1Y0_Dot16 = _ddotpsu4h(D128_Y1Y0_0B0D0A0C, D128_C1C0_0B0D0A0C); D64_V1U0_Dot16 = _ddotpsu4h(D128_V1U0_0B0D0A0C, D128_C1C0_0B0D0A0C); D64_V1_Y1_U0_Y0 = _dpackl4(D64_V1U0_SHRU, D64_Y1Y0_SHRU); D32_V1_Y1 = _hill(D64_V1_Y1_U0_Y0); D32_U0_Y0 = _loll(D64_V1_Y1_U0_Y0); D32_V1Y1U0Y0 = _packl4(D32_V1_Y1, D32_U0_Y0); if(Index_i) { *pDtmp++ = D32_V1Y1U0Y0; } //----------------- } } return ; } /code