G2D圖像處理硬件調用和測試-基于米爾-全志T113-i開發板
MYC-YT113i核心板及開發板
真正的國產核心板,100%國產物料認證
國產T113-i處理器配備2*Cortex-A7@1.2GHz ,RISC-V
外置DDR3接口、支持視頻編解碼器、HiFi4 DSP
接口豐富:視頻采集接口、顯示器接口、USB2.0 接口、CAN 接口、千兆以太網接口
工業級:-40℃~+85℃、尺寸37mm*39mm
郵票孔+LGA,140+50PIN
全志 T113-i 2D圖形加速硬件支持情況
Supports layer size up to 2048 x 2048 pixels
Supports pre-multiply alpha image data
Supports color key
Supports two pipes Porter-Duff alpha blending
Supports multiple video formats 4:2:0, 4:2:2, 4:1:1 and multiple pixel formats (8/16/24/32 bits graphics
layer)
Supports memory scan order option
Supports any format convert function
Supports 1/16× to 32× resize ratio
Supports 32-phase 8-tap horizontal anti-alias filter and 32-phase 4-tap vertical anti-alias filter
Supports window clip
Supports FillRectangle, BitBlit, StretchBlit and MaskBlit
Supports horizontal and vertical flip, clockwise 0/90/180/270 degree rotate for normal buffer
Supports horizontal flip, clockwise 0/90/270 degree rotate for LBC buffer
可以看到 g2d 硬件支持相當多的2D圖像處理,包括顏色空間轉換,分辨率縮放,圖層疊加,旋轉等
開發環境配置
基礎開發環境搭建參考上上上一篇
https://bbs.elecfans.com/jishu_2408808_1_1.html
除了工具鏈外,我們使用 opencv-mobile 加載輸入圖片和保存結果,用來查看顏色轉換是否正常
g2d硬件直接采用標準的 Linux ioctl 操縱,只需要引入相關結構體定義即可,無需鏈接so
https://github.com/MYIR-ALLWINNER/framework/blob/develop-yt113-framework/auto/sdk_lib/include/g2d_driver.h
此外,g2d的輸入和輸出數據必須在dmaion buffer上,因此還需要dmaion.h頭文件,用來分配和釋放dmaion buffer
https://github.com/MYIR-ALLWINNER/framework/blob/develop-yt113-framework/auto/sdk_lib/include/DmaIon.h
基于C語言實現的YUV轉RGB
這里復用之前T113-i JPG解碼的函數
// Clamp one fixed-point channel result into the 0..255 range of an 8-bit sample.
static inline unsigned char yuv420sp2rgb_clamp_u8(int value)
{
    return (unsigned char)::std::min(::std::max(value, 0), 255);
}

// Emit one RGB pixel (3 bytes: R, G, B) for a single luma sample.
// The chroma terms are already scaled by 64, so luma is scaled the same way
// before the sum is shifted back down by 6 bits.
static inline void yuv420sp2rgb_put_pixel(unsigned char* dst, unsigned char luma,
                                          int r_term, int g_term, int b_term)
{
    const int y64 = luma << 6;
    dst[0] = yuv420sp2rgb_clamp_u8((y64 + r_term) >> 6);
    dst[1] = yuv420sp2rgb_clamp_u8((y64 + g_term) >> 6);
    dst[2] = yuv420sp2rgb_clamp_u8((y64 + b_term) >> 6);
}

// Plain C++ conversion of a YUV420SP frame to packed 24-bit RGB.
//
// yuv420sp : w*h luma bytes followed by interleaved chroma bytes, read as V then U
// w, h     : frame size in pixels; the loops walk 2x2 pixel blocks, so both are
//            expected to be even (an odd value makes the last block reach one
//            column/row past the frame)
// rgb      : output, 3 bytes per pixel in R, G, B order, w*3 bytes per row
//
// Integer approximation used (6 fractional bits):
//   R = Y + 1.370705 * (V-128)                         ~ (64*Y + 90*(V-128)) >> 6
//   G = Y - 0.698001 * (V-128) - 0.337633 * (U-128)    ~ (64*Y - 46*(V-128) - 22*(U-128)) >> 6
//   B = Y + 1.732446 * (U-128)                         ~ (64*Y + 113*(U-128)) >> 6
void yuv420sp2rgb(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb)
{
    const unsigned char* luma_rows = yuv420sp;
    const unsigned char* chroma = yuv420sp + w * h;

    for (int row = 0; row < h; row += 2)
    {
        // Two luma rows share one chroma row.
        const unsigned char* luma_top = luma_rows;
        const unsigned char* luma_bottom = luma_rows + w;
        unsigned char* out_top = rgb;
        unsigned char* out_bottom = rgb + w * 3;

        for (int col = 0; col < w; col += 2)
        {
            // One V/U pair colours the whole 2x2 block.
            const int v = chroma[0] - 128;
            const int u = chroma[1] - 128;

            const int r_term = 90 * v;
            const int g_term = -46 * v + -22 * u;
            const int b_term = 113 * u;

            yuv420sp2rgb_put_pixel(out_top, luma_top[0], r_term, g_term, b_term);
            yuv420sp2rgb_put_pixel(out_top + 3, luma_top[1], r_term, g_term, b_term);
            yuv420sp2rgb_put_pixel(out_bottom, luma_bottom[0], r_term, g_term, b_term);
            yuv420sp2rgb_put_pixel(out_bottom + 3, luma_bottom[1], r_term, g_term, b_term);

            luma_top += 2;
            luma_bottom += 2;
            chroma += 2;
            out_top += 6;
            out_bottom += 6;
        }

        luma_rows += 2 * w;
        rgb += 2 * 3 * w;
    }
}
基于ARM neon指令集優化的YUV轉RGB
考慮到armv7編譯器的自動neon優化能力較差,這里針對性地編寫 arm neon inline assembly 實現YUV2RGB內核部分,達到最優化的性能,榨干cpu性能
// Convert a YUV420SP frame to packed 24-bit RGB, using NEON for the bulk of each row.
//
// yuv420sp : w*h luma bytes followed by interleaved chroma bytes, read as V then U
// w, h     : frame size in pixels; rows are handled in pairs and the scalar tail
//            handles columns in pairs
// rgb      : output, 3 bytes per pixel in R, G, B order, w*3 bytes per row
//
// When __ARM_NEON is set, each row pair is processed 8 pixels at a time
// (intrinsics on aarch64, inline assembly otherwise) and the remaining
// w % 8 pixels fall through to the scalar loop. Without NEON the scalar loop
// handles the whole row.
void yuv420sp2rgb_neon(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb)
{
const unsigned char* yptr = yuv420sp;
// chroma bytes start right after the w*h luma bytes
const unsigned char* vuptr = yuv420sp + w * h;
#if __ARM_NEON
// constants broadcast to all 8 lanes: the chroma bias and the fixed-point
// coefficients (scaled by 64) used in the formulas documented further down
uint8x8_t _v128 = vdup_n_u8(128);
int8x8_t _v90 = vdup_n_s8(90);
int8x8_t _v46 = vdup_n_s8(46);
int8x8_t _v22 = vdup_n_s8(22);
int8x8_t _v113 = vdup_n_s8(113);
#endif // __ARM_NEON
// each iteration converts two luma rows that share one chroma row
for (int y = 0; y < h; y += 2)
{
const unsigned char* yptr0 = yptr;
const unsigned char* yptr1 = yptr + w;
unsigned char* rgb0 = rgb;
unsigned char* rgb1 = rgb + w * 3;
#if __ARM_NEON
// nn = number of 8-pixel groups per row, remain = leftover pixels for the scalar loop
int nn = w >> 3;
int remain = w - (nn << 3);
#else
int remain = w;
#endif // __ARM_NEON
#if __ARM_NEON
#if __aarch64__
for (; nn > 0; nn--)
{
// widen 8 luma bytes per row to 16 bit while multiplying by 64
int16x8_t _yy0 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr0), 6));
int16x8_t _yy1 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr1), 6));
// 8 chroma bytes (V,U,V,U,...) minus 128, reinterpreted as signed
int8x8_t _vvuu = vreinterpret_s8_u8(vsub_u8(vld1_u8(vuptr), _v128));
// transposing the vector with itself duplicates each sample:
// val[0] = V0,V0,V1,V1,... and val[1] = U0,U0,U1,U1,...
int8x8x2_t _vvvvuuuu = vtrn_s8(_vvuu, _vvuu);
int8x8_t _vv = _vvvvuuuu.val[0];
int8x8_t _uu = _vvvvuuuu.val[1];
// row 0: r = yy + 90*vv, g = yy - 46*vv - 22*uu, b = yy + 113*uu
int16x8_t _r0 = vmlal_s8(_yy0, _vv, _v90);
int16x8_t _g0 = vmlsl_s8(_yy0, _vv, _v46);
_g0 = vmlsl_s8(_g0, _uu, _v22);
int16x8_t _b0 = vmlal_s8(_yy0, _uu, _v113);
// row 1: same chroma terms applied to the second luma row
int16x8_t _r1 = vmlal_s8(_yy1, _vv, _v90);
int16x8_t _g1 = vmlsl_s8(_yy1, _vv, _v46);
_g1 = vmlsl_s8(_g1, _uu, _v22);
int16x8_t _b1 = vmlal_s8(_yy1, _uu, _v113);
// shift right by 6 with unsigned saturation to get the 8-bit channels
uint8x8x3_t _rgb0;
_rgb0.val[0] = vqshrun_n_s16(_r0, 6);
_rgb0.val[1] = vqshrun_n_s16(_g0, 6);
_rgb0.val[2] = vqshrun_n_s16(_b0, 6);
uint8x8x3_t _rgb1;
_rgb1.val[0] = vqshrun_n_s16(_r1, 6);
_rgb1.val[1] = vqshrun_n_s16(_g1, 6);
_rgb1.val[2] = vqshrun_n_s16(_b1, 6);
// interleaved store: R,G,B,R,G,B,... for 8 pixels of each row
vst3_u8(rgb0, _rgb0);
vst3_u8(rgb1, _rgb1);
yptr0 += 8;
yptr1 += 8;
vuptr += 8;
rgb0 += 24;
rgb1 += 24;
}
#else
if (nn > 0)
{
// 32-bit ARM path: the same 8-pixel step as the intrinsics branch above,
// written as inline assembly with the instructions interleaved by hand.
// Register use: d0/d1 = luma of row 0/1, d2 = V lanes, d3 = U lanes,
// q8/q9/q2 = r/g/b of row 0, q10/q11/q3 = r/g/b of row 1.
// Operands %0-%5 are the loop counter and the five pointers (updated in
// place through the matching "0".."5" input constraints); %12-%16 are the
// broadcast constants.
asm volatile(
"0: \n"
"pld [%3, #128] \n"
"vld1.u8 {d2}, [%3]! \n"
"vsub.s8 d2, d2, %12 \n"
"pld [%1, #128] \n"
"vld1.u8 {d0}, [%1]! \n"
"pld [%2, #128] \n"
"vld1.u8 {d1}, [%2]! \n"
"vshll.u8 q2, d0, #6 \n"
"vorr d3, d2, d2 \n"
"vshll.u8 q3, d1, #6 \n"
"vorr q9, q2, q2 \n"
"vtrn.s8 d2, d3 \n"
"vorr q11, q3, q3 \n"
"vmlsl.s8 q9, d2, %14 \n"
"vorr q8, q2, q2 \n"
"vmlsl.s8 q11, d2, %14 \n"
"vorr q10, q3, q3 \n"
"vmlal.s8 q8, d2, %13 \n"
"vmlal.s8 q2, d3, %16 \n"
"vmlal.s8 q10, d2, %13 \n"
"vmlsl.s8 q9, d3, %15 \n"
"vmlal.s8 q3, d3, %16 \n"
"vmlsl.s8 q11, d3, %15 \n"
"vqshrun.s16 d24, q8, #6 \n"
"vqshrun.s16 d26, q2, #6 \n"
"vqshrun.s16 d4, q10, #6 \n"
"vqshrun.s16 d25, q9, #6 \n"
"vqshrun.s16 d6, q3, #6 \n"
"vqshrun.s16 d5, q11, #6 \n"
"subs %0, #1 \n"
"vst3.u8 {d24-d26}, [%4]! \n"
"vst3.u8 {d4-d6}, [%5]! \n"
"bne 0b \n"
: "=r"(nn), // %0
"=r"(yptr0), // %1
"=r"(yptr1), // %2
"=r"(vuptr), // %3
"=r"(rgb0), // %4
"=r"(rgb1) // %5
: "0"(nn),
"1"(yptr0),
"2"(yptr1),
"3"(vuptr),
"4"(rgb0),
"5"(rgb1),
"w"(_v128), // %12
"w"(_v90), // %13
"w"(_v46), // %14
"w"(_v22), // %15
"w"(_v113) // %16
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "d26");
}
#endif // __aarch64__
#endif // __ARM_NEON
// scalar tail (or the whole row without NEON): one 2x2 pixel block per iteration
// NOTE: the macro body ends with ';', so it is only usable as a full statement
#define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
for (; remain > 0; remain -= 2)
{
// R = 1.164 * yy + 1.596 * vv
// G = 1.164 * yy - 0.813 * vv - 0.391 * uu
// B = 1.164 * yy + 2.018 * uu
// R = Y + (1.370705 * (V-128))
// G = Y - (0.698001 * (V-128)) - (0.337633 * (U-128))
// B = Y + (1.732446 * (U-128))
// R = ((Y << 6) + 87.72512 * (V-128)) >> 6
// G = ((Y << 6) - 44.672064 * (V-128) - 21.608512 * (U-128)) >> 6
// B = ((Y << 6) + 110.876544 * (U-128)) >> 6
// R = ((Y << 6) + 90 * (V-128)) >> 6
// G = ((Y << 6) - 46 * (V-128) - 22 * (U-128)) >> 6
// B = ((Y << 6) + 113 * (U-128)) >> 6
// R = (yy + 90 * vv) >> 6
// G = (yy - 46 * vv - 22 * uu) >> 6
// B = (yy + 113 * uu) >> 6
// one V/U pair is shared by the four pixels of the block
int v = vuptr[0] - 128;
int u = vuptr[1] - 128;
int ruv = 90 * v;
int guv = -46 * v + -22 * u;
int buv = 113 * u;
// top-left pixel
int y00 = yptr0[0] << 6;
rgb0[0] = SATURATE_CAST_UCHAR((y00 + ruv) >> 6);
rgb0[1] = SATURATE_CAST_UCHAR((y00 + guv) >> 6);
rgb0[2] = SATURATE_CAST_UCHAR((y00 + buv) >> 6);
// top-right pixel
int y01 = yptr0[1] << 6;
rgb0[3] = SATURATE_CAST_UCHAR((y01 + ruv) >> 6);
rgb0[4] = SATURATE_CAST_UCHAR((y01 + guv) >> 6);
rgb0[5] = SATURATE_CAST_UCHAR((y01 + buv) >> 6);
// bottom-left pixel
int y10 = yptr1[0] << 6;
rgb1[0] = SATURATE_CAST_UCHAR((y10 + ruv) >> 6);
rgb1[1] = SATURATE_CAST_UCHAR((y10 + guv) >> 6);
rgb1[2] = SATURATE_CAST_UCHAR((y10 + buv) >> 6);
// bottom-right pixel
int y11 = yptr1[1] << 6;
rgb1[3] = SATURATE_CAST_UCHAR((y11 + ruv) >> 6);
rgb1[4] = SATURATE_CAST_UCHAR((y11 + guv) >> 6);
rgb1[5] = SATURATE_CAST_UCHAR((y11 + buv) >> 6);
yptr0 += 2;
yptr1 += 2;
vuptr += 2;
rgb0 += 6;
rgb1 += 6;
}
#undef SATURATE_CAST_UCHAR
// advance past the two rows just converted
yptr += 2 * w;
rgb += 2 * 3 * w;
}
}
基于G2D圖形硬件的YUV轉RGB
我們先實現 dmaion buffer 管理器,參考
https://github.com/MYIR-ALLWINNER/framework/blob/develop-yt113-framework/auto/sdk_lib/sdk_memory/DmaIon.cpp
這里貼的代碼省略了異常錯誤處理的邏輯,有個坑是 linux-4.9 和 linux-5.4 用法不一樣,米爾電子的這個T113-i系統是linux-5.4,所以不兼容4.9內核的ioctl用法習慣
// Describes one buffer handed out by ion_allocator::alloc().
//
// The defaults describe "no buffer": ion_allocator::free() treats fd == -1 as
// nothing to release, so a default-constructed ion_memory can be passed to
// free() safely even if alloc() was never called on it.
struct ion_memory
{
    size_t size = 0;           // length in bytes of the mapping at virt_addr
    int fd = -1;               // buffer file descriptor returned by the allocation ioctl; -1 = none
    void* virt_addr = nullptr; // CPU-visible address obtained from mmap() on fd
    unsigned int phy_addr = 0; // address reported by IOCTL_GET_IOMMU_ADDR for this buffer
};
// Small manager for dma-buf allocations on the ION device.
//
// Holds two file descriptors: ion_fd for /dev/ion (buffer allocation) and
// cedar_fd for /dev/cedar_dev (engine request/release and the IOMMU address
// ioctls). The destructor calls close().
class ion_allocator
{
public:
    ion_allocator();
    ~ion_allocator();

    // The object owns its descriptors and releases them in the destructor, so a
    // copy would close the same descriptors twice. Copying is therefore disabled.
    ion_allocator(const ion_allocator&) = delete;
    ion_allocator& operator=(const ion_allocator&) = delete;

    // Open /dev/ion and /dev/cedar_dev and request the engine. Returns 0 on success.
    int open();

    // Release the engine and close both descriptors; does nothing for descriptors that are -1.
    void close();

    // Allocate a buffer of `size` bytes, map it, query its IOMMU address and
    // fill in *mem. Returns 0 on success.
    int alloc(size_t size, struct ion_memory* mem);

    // Release a buffer obtained from alloc() and reset *mem; a no-op when mem->fd is -1.
    int free(struct ion_memory* mem);

    // Issue a DMA_BUF_IOCTL_SYNC on the buffer so CPU and device views agree.
    int flush(struct ion_memory* mem);

public:
    int ion_fd;   // descriptor for /dev/ion, -1 when closed
    int cedar_fd; // descriptor for /dev/cedar_dev, -1 when closed
};
// Start with both descriptors marked as closed (-1) so close() has nothing to do
// until open() succeeds in setting them.
ion_allocator::ion_allocator()
    : ion_fd(-1), cedar_fd(-1)
{
}
// Release whatever descriptors are still open when the allocator goes away.
ion_allocator::~ion_allocator()
{
    this->close();
}
// Open the ION and cedar devices and request the engine.
//
// Any descriptors left from an earlier open() are closed first.
// Returns 0 on success. Returns -1 if either device cannot be opened or the
// engine request fails; in that case both descriptors are left at -1.
int ion_allocator::open()
{
    close();

    ion_fd = ::open("/dev/ion", O_RDWR);
    if (ion_fd < 0)
    {
        ion_fd = -1;
        return -1;
    }

    cedar_fd = ::open("/dev/cedar_dev", O_RDONLY);
    if (cedar_fd < 0)
    {
        cedar_fd = -1;
        ::close(ion_fd);
        ion_fd = -1;
        return -1;
    }

    if (ioctl(cedar_fd, IOCTL_ENGINE_REQ, 0) < 0)
    {
        // Close by hand rather than via close(): the engine was never
        // acquired, so IOCTL_ENGINE_REL must not be issued for it.
        ::close(cedar_fd);
        cedar_fd = -1;
        ::close(ion_fd);
        ion_fd = -1;
        return -1;
    }

    return 0;
}
// Tear down both device descriptors. Safe to call repeatedly: a descriptor
// that is already -1 is skipped.
void ion_allocator::close()
{
    const bool cedar_is_open = (cedar_fd != -1);
    if (cedar_is_open)
    {
        // give the engine back before dropping the descriptor
        ioctl(cedar_fd, IOCTL_ENGINE_REL, 0);
        ::close(cedar_fd);
        cedar_fd = -1;
    }

    const bool ion_is_open = (ion_fd != -1);
    if (ion_is_open)
    {
        ::close(ion_fd);
        ion_fd = -1;
    }
}
// Allocate a buffer of `size` bytes from the ION system heap, map it into
// this process and look up its IOMMU address.
//
// size : requested length in bytes
// mem  : receives size, buffer fd, mapped address and IOMMU address
//
// Returns 0 on success. Returns -1 if mem is null or any step fails; on
// failure everything acquired so far is released and *mem is left in the
// empty state (fd == -1), so a later free(mem) is a harmless no-op.
int ion_allocator::alloc(size_t size, struct ion_memory* mem)
{
    if (!mem)
        return -1;

    mem->size = 0;
    mem->fd = -1;
    mem->virt_addr = 0;
    mem->phy_addr = 0;

    struct aw_ion_new_alloc_data alloc_data;
    alloc_data.len = size;
    alloc_data.heap_id_mask = AW_ION_SYSTEM_HEAP_MASK;
    // NOTE(review): presumably a CPU-cached buffer that needs explicit sync,
    // which is why callers go through flush() - confirm against the ION header
    alloc_data.flags = AW_ION_CACHED_FLAG | AW_ION_CACHED_NEEDS_SYNC_FLAG;
    alloc_data.fd = 0;
    alloc_data.unused = 0;
    if (ioctl(ion_fd, AW_ION_IOC_NEW_ALLOC, &alloc_data) < 0)
        return -1;

    void* virt_addr = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, alloc_data.fd, 0);
    if (virt_addr == MAP_FAILED)
    {
        ::close(alloc_data.fd);
        return -1;
    }

    struct aw_user_iommu_param iommu_param;
    iommu_param.fd = alloc_data.fd;
    iommu_param.iommu_addr = 0;
    if (ioctl(cedar_fd, IOCTL_GET_IOMMU_ADDR, &iommu_param) < 0)
    {
        munmap(virt_addr, size);
        ::close(alloc_data.fd);
        return -1;
    }

    mem->size = size;
    mem->fd = alloc_data.fd;
    mem->virt_addr = virt_addr;
    mem->phy_addr = iommu_param.iommu_addr;
    return 0;
}
// Give back a buffer obtained from alloc(): drop its IOMMU mapping, unmap it,
// close its descriptor and reset *mem to the empty state.
// A buffer whose fd is already -1 is left untouched. Always returns 0.
int ion_allocator::free(struct ion_memory* mem)
{
    const int buffer_fd = mem->fd;
    if (buffer_fd != -1)
    {
        struct aw_user_iommu_param iommu_param;
        iommu_param.fd = buffer_fd;
        ioctl(cedar_fd, IOCTL_FREE_IOMMU_ADDR, &iommu_param);

        munmap(mem->virt_addr, mem->size);
        ::close(buffer_fd);

        // mark the descriptor as released so a second free() is a no-op
        mem->size = 0;
        mem->fd = -1;
        mem->virt_addr = 0;
        mem->phy_addr = 0;
    }
    return 0;
}
// Synchronise CPU and device views of a buffer by issuing DMA_BUF_IOCTL_SYNC
// with DMA_BUF_SYNC_END | DMA_BUF_SYNC_RW on the buffer's descriptor.
//
// Returns 0 on success, -1 if mem is null, holds no buffer (fd == -1), or the
// ioctl fails.
//
// NOTE(review): in the dma-buf UAPI, SYNC_END marks the end of a CPU access.
// Callers in this file also use flush() before reading device output; confirm
// whether that path should issue DMA_BUF_SYNC_START instead.
int ion_allocator::flush(struct ion_memory* mem)
{
    if (!mem || mem->fd == -1)
        return -1;

    struct dma_buf_sync sync;
    sync.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_RW;
    if (ioctl(mem->fd, DMA_BUF_IOCTL_SYNC, &sync) < 0)
        return -1;

    return 0;
}
然后再實現 G2D圖形硬件 YUV轉RGB 的轉換器
1. 提前分配好YUV和RGB的dmaion buffer
2. 將YUV數據拷貝到dmaion buffer,flush cache完成同步
3. 配置轉換參數,ioctl調用G2D_CMD_BITBLT_H完成轉換
4. flush cache完成同步,從dmaion buffer拷貝出RGB數據
5. 釋放dmaion buffer
// Step 1: open the allocator and the G2D device, and allocate one dma-buf for
// the YUV input and one for the RGB output.
// Each buffer is declared before it is passed to alloc(), with its own size.
ion_allocator ion;
ion.open();
struct ion_memory yuv_ion;
ion.alloc(yuv_size, &yuv_ion);
struct ion_memory rgb_ion;
ion.alloc(rgb_size, &rgb_ion);
int g2d_fd = ::open("/dev/g2d", O_RDWR);
// Step 2: copy the YUV frame into its dma-buf and sync it for the device
memcpy((unsigned char*)yuv_ion.virt_addr, yuv420sp, yuv_size);
ion.flush(&yuv_ion);
// Step 3: describe source and destination images, then run the blit
g2d_blt_h blit;
memset(&blit, 0, sizeof(blit));
blit.flag_h = G2D_BLT_NONE_H;
// source: YUV420 semi-planar image, passed by dma-buf fd rather than physical address
blit.src_image_h.format = G2D_FORMAT_YUV420UVC_V1U1V0U0;
blit.src_image_h.width = width;
blit.src_image_h.height = height;
blit.src_image_h.align[0] = 0;
blit.src_image_h.align[1] = 0;
blit.src_image_h.clip_rect.x = 0;
blit.src_image_h.clip_rect.y = 0;
blit.src_image_h.clip_rect.w = width;
blit.src_image_h.clip_rect.h = height;
blit.src_image_h.gamut = G2D_BT601;
blit.src_image_h.bpremul = 0;
blit.src_image_h.mode = G2D_PIXEL_ALPHA;
blit.src_image_h.use_phy_addr = 0;
blit.src_image_h.fd = yuv_ion.fd;
// destination: RGB888 image of the same size, also passed by dma-buf fd
blit.dst_image_h.format = G2D_FORMAT_RGB888;
blit.dst_image_h.width = width;
blit.dst_image_h.height = height;
blit.dst_image_h.align[0] = 0;
blit.dst_image_h.clip_rect.x = 0;
blit.dst_image_h.clip_rect.y = 0;
blit.dst_image_h.clip_rect.w = width;
blit.dst_image_h.clip_rect.h = height;
blit.dst_image_h.gamut = G2D_BT601;
blit.dst_image_h.bpremul = 0;
blit.dst_image_h.mode = G2D_PIXEL_ALPHA;
blit.dst_image_h.use_phy_addr = 0;
blit.dst_image_h.fd = rgb_ion.fd;
ioctl(g2d_fd, G2D_CMD_BITBLT_H, &blit);
// Step 4: sync the output buffer and copy the RGB result out
ion.flush(&rgb_ion);
memcpy(rgb, (const unsigned char*)rgb_ion.virt_addr, rgb_size);
// Step 5: release the buffers and close the devices
ion.free(&rgb_ion);
ion.free(&yuv_ion);
ion.close();
::close(g2d_fd);
G2D圖像硬件YUV轉RGB測試
考慮到dmaion buffer分配和釋放都比較耗時,我們提前做好,循環調用步驟3的G2D轉換,統計耗時,并在top工具中查看CPU占用率
sh-4.4# LD_LIBRARY_PATH=. ./g2dtest
INFO : cedarc
this device is not whitelisted for jpeg decoder cvi
this device is not whitelisted for jpeg decoder cvi
this device is not whitelisted for jpeg decoder cvi
this device is not whitelisted for jpeg encoder rkmpp
INFO : cedarc
ERROR : cedarc
ERROR : cedarc
yuv420sp2rgb 46.61
yuv420sp2rgb 42.04
yuv420sp2rgb 41.32
yuv420sp2rgb 42.06
yuv420sp2rgb 41.69
yuv420sp2rgb 42.05
yuv420sp2rgb 41.29
yuv420sp2rgb 41.30
yuv420sp2rgb 42.14
yuv420sp2rgb 41.33
yuv420sp2rgb_neon 10.57
yuv420sp2rgb_neon 7.21
yuv420sp2rgb_neon 6.77
yuv420sp2rgb_neon 8.31
yuv420sp2rgb_neon 7.60
yuv420sp2rgb_neon 6.80
yuv420sp2rgb_neon 6.77
yuv420sp2rgb_neon 7.01
yuv420sp2rgb_neon 7.11
yuv420sp2rgb_neon 7.06
yuv420sp2rgb_g2d 4.32
yuv420sp2rgb_g2d 4.69
yuv420sp2rgb_g2d 4.56
yuv420sp2rgb_g2d 4.57
yuv420sp2rgb_g2d 4.52
yuv420sp2rgb_g2d 4.54
yuv420sp2rgb_g2d 4.52
yuv420sp2rgb_g2d 4.58
yuv420sp2rgb_g2d 4.60
yuv420sp2rgb_g2d 4.67
可以看到 ARM neon 的優化效果非常明顯,而使用G2D圖形硬件能獲得進一步加速,并且能顯著降低CPU占用率!
轉換結果對比和分析
C和neon的轉換結果完全一致,但是g2d轉換后的圖片有明顯的色差
G2D圖形硬件只支持 G2D_BT601,G2D_BT709,G2D_BT2020 3種YUV系數,而JPG所使用的YUV系數是改版BT601,因此產生了色差
https://github.com/MYIR-ALLWINNER/myir-t1-kernel/blob/develop-yt113-L5.4.61/drivers/char/sunxi_g2d/g2d_bsp_v2.c
從g2d內核驅動中也可以得知,暫時沒有方法為g2d設置自定義的YUV系數,g2d不適合用于JPG的編解碼,但依然適合攝像頭和視頻編解碼的顏色空間轉換
同類文章排行
- 萬馬高分子助力,國內首條公里級大長度環保
- 主營產品有哪些?
- 購買后產品發(fā)什么快遞?
- 節能轉型,電機產業鏈有哪些變革性機會?
- 更緊湊而高效的機器人世界
- 機器手臂的創(chuàng)新應用:輕薄短小、智能高效
- 產品供貨周期需要多久?
- 當半導體碰上 AMR,來一場智能化的精彩
- 堅持科技是第一生產力
- 通向智能工廠的硬核技術,哪些和你有關?
最新資訊文章
- 英孚康是羅克韋爾的替代品?不止如此
- 歐洲航天局利用MVG設備大幅增強新型 H
- Profinet轉canopen網關連接
- DATALOGIC得利捷 | 物流之眼利
- 施耐德電氣與標領智能裝備強強聯合,共創電
- 【有現貨】KB-LS10N-C KB-L
- 華北工控打造網安專用主板,基于飛騰D20
- PLC通訊革新:EtherNetIP轉P
- 華北工控ATX-6152:高度集成化!提
- 巴斯夫成功完成Ethernet-APL試
- HRPG-1000N3 系列:1000W
- RQB60W12 系列:60W 1/4
- NPB-450-NFC 系列:450W
- VFD 系列:150W~750W 工業用
- NGE12/18 系列:12W/18W
- 工業(yè)現場ModbusTCP轉EtherN
- DJM / FT系列:12V/38~15
- SI06W8/DI06W8 系列:超寬壓
- NGE100 (U) 系列:100W 環
- LOP-200/300系列:200W &