2025年2月24日 星期一 甲辰(龙)年 腊月廿四 设为首页 加入收藏
rss
您当前的位置:首页 > 计算机 > 编程开发 > 安卓(android)开发

Android使用GPU加速JPEG图片解码(Opencl)

时间:09-07来源:作者:点击数:30

1.创建opencl kernel

(1)创建kerenl并编译shading文件

(2)获取kernel工作组信息

2.GPU和CPU协同工作

(1)创建内存,并同步到CPU和GPU,如:

        tablebuffer = opencl_create_mem(context, cinfo->comps_in_scan*DCTSIZE2*sizeof(float));

        opencl_sync_mem(tablebuffer, TOCPU);

        opencl_sync_mem(tablebuffer, TOGPU);

(2)设置GPU kernel参数,如:

        int error;

        size_t alignblocks;

        cl_kernel kernel = jpeg_get_kernel(IDCT_FLOAT);

        opencl_set_mem(kernel, inputBuffer, 0);

        opencl_set_mem(kernel, yuvBuffer, 1);

        opencl_set_mem(kernel, tablebuffer, 2);

        opencl_set_mem(kernel, offsetbuffer, 3);

        error = aclSetKernelArg(kernel, 4, sizeof(int), &cinfo->blocks_in_MCU);

        assert(CL_SUCCESS == error);

        error = aclSetKernelArg(kernel, 5, sizeof(size_t), &totalBlock);

        assert(CL_SUCCESS == error);

(3)加入GPU渲染管线,并等待计算完成,如:

        error = aclEnqueueNDRangeKernel(context->queue, kernel, 1, NULL, &alignblocks, gMaxGroupSize+IDCT_FLOAT, 0, NULL, NULL);

        assert(CL_SUCCESS == error);

        aclFinish(context->queue);

JpegNativeLib.cpp (JNI) :

  • //
  • // Created by Administrator on 2017/1/18.
  • //
  • extern "C"
  • {
  • #include "jpeglib.h"
  • #include <opencl/jpeg_opencl.h>
  • };
  • #include <jni.h>
  • #include <stdio.h>
  • #include <string.h>
  • #include <time.h>
  • #include <unistd.h>
  • #include <android/log.h>
  • #define LOG_TAG "MYLOG"
  • #define LOGI(...) __android_log_print(ANDROID_LOG_INFO,LOG_TAG,__VA_ARGS__)
  • #define LOGE(...) __android_log_print(ANDROID_LOG_ERROR,LOG_TAG,__VA_ARGS__)
  • #include <UsrGLED.h>
  • using namespace UGLES;
  • void preDecode(FILE *infile, jpeg_decompress_struct *cinfo)
  • {
  • struct jpeg_error_mgr jerr;
  • cinfo->err = jpeg_std_error(&jerr);
  • jpeg_stdio_src(cinfo, infile);
  • (void) jpeg_read_header(cinfo, TRUE);
  • /*必须设成float方式,若是cpu解码,这个会影响性能,但对于gpu来说没有关系,而且float方式精度最高,几乎不会造成图片失真*/
  • cinfo->dct_method = JDCT_FLOAT;
  • (void) jpeg_start_decompress(cinfo);
  • auto width = cinfo->output_width;
  • auto height = cinfo->output_height;
  • LOGI("W=%d,H=%d",width,height);
  • }
  • int jpegDecode(GLuint glTextureId,int flag)
  • {
  • LOGI("Java_zhw_jpegclapp_MainActivity_jpegDecode");
  • {
  • #if 1
  • /*opencl 解码的 api*/
  • const char *inputfile = "/sdcard/3040x1520_4.jpeg";
  • struct jpeg_decompress_struct cinfo;
  • jpeg_create_decompress(&cinfo);//创建解码器
  • FILE *infile = fopen(inputfile, "rb");
  • if (NULL == infile) {
  • LOGE("Open input file err.");
  • return -1;
  • }
  • preDecode(infile, &cinfo);
  • LOGI("GPU DECODE START");
  • int ret = jpeg_decode_by_opencl(&cinfo,glTextureId,flag);
  • fclose(infile);
  • if (-1 == ret) {
  • LOGE("jpeg_decode_by_opencl failed");
  • return -1;
  • }
  • LOGI("GPU DECODE END");
  • #endif
  • //用得到的pixels做一些事情
  • //这里用abort而不是finish,直接中止掉
  • (void) jpeg_abort_decompress(&cinfo);
  • jpeg_destroy_decompress(&cinfo);
  • return 0;
  • }
  • }
  • UsrGLES usrGLES;
  • GLuint textureId=0;
  • extern "C"
  • void Java_zhw_jpegclapp_GL2JNILib_init(JNIEnv * env, jobject obj, jint width, jint height)
  • {
  • usrGLES.setupGraphics(width, height);
  • textureId=usrGLES.getTextureID();
  • if (-1 == openclInit()) {
  • LOGE("openclInit failed");
  • return;
  • }
  • if(-1==jpegDecode(textureId,0)) {
  • LOGE("jpegDecode error !");
  • return;
  • }
  • }
  • extern "C"
  • void Java_zhw_jpegclapp_GL2JNILib_step(JNIEnv * env, jobject obj)
  • {
  • TIME_START;
  • if(-1==jpegDecode(textureId,1)) {
  • LOGE("jpegDecode error !");
  • return;
  • }
  • TIME_END;
  • usrGLES.renderFrame();
  • }
  • extern "C"
  • void Java_zhw_jpegclapp_GL2JNILib_stop(JNIEnv * env, jobject obj)
  • {
  • openclDestroy();
  • }

jpeg_opencl.h

  • #ifndef JPEG_OPENCL_H
  • #define JPEG_OPENCL_H
  • #include "opencl_package.h"
  • #include <stdio.h>
  • #include "jpeglib.h"
  • #include "jpegint.h"
  • #define DEUBUG_ON
  • #ifdef DEUBUG_ON
  • #include <sys/time.h>
  • #include <android/log.h>
  • #include <GLES2/gl2.h>
  • #define LOG_TAG "MYLOG"
  • #define LOGI(...) __android_log_print(ANDROID_LOG_INFO,LOG_TAG,__VA_ARGS__)
  • #define LOGE(...) __android_log_print(ANDROID_LOG_ERROR,LOG_TAG,__VA_ARGS__)
  • #define TIME_START struct timeval tv_start, tv_end;gettimeofday(&tv_start, NULL);
  • #define TIME_END gettimeofday(&tv_end, NULL); LOGI("Time = %fms in %s, %d\n", (tv_end.tv_sec*1000000 + tv_end.tv_usec - tv_start.tv_sec*1000000 - tv_start.tv_usec)/1000.0f, __FUNCTION__, __LINE__);gettimeofday(&tv_start, NULL);
  • #else
  • #define TIME_START
  • #define TIME_END
  • #endif
  • opencl_context* jpeg_get_context();
  • typedef enum
  • {
  • IDCT_FLOAT=0,
  • YUV_RGB,
  • KERNELNUMBER
  • }KERNELNAME;
  • cl_kernel jpeg_get_kernel(KERNELNAME name);
  • int jpeg_decode_by_opencl(j_decompress_ptr cinfo, GLuint glTextureId,int flag);
  • int openclInit();
  • void openclDestroy();
  • #endif

jpeg_opencl.c

  • #include "jpeg_opencl.h"
  • #include <pthread.h>
  • #include <assert.h>
  • #include <string.h>
  • #include <stdlib.h>
  • #include <android/log.h>
  • #include <OpenCL/cl_gl.h>
  • #include <unistd.h>
  • #include "aopencl.h"
  • #define LOG_TAG "CL"
  • #define LOGI(...) __android_log_print(ANDROID_LOG_INFO,LOG_TAG,__VA_ARGS__)
  • #define LOGE(...) __android_log_print(ANDROID_LOG_ERROR,LOG_TAG,__VA_ARGS__)
  • static opencl_context* gInstance = NULL;
  • static pthread_mutex_t gMutex = PTHREAD_MUTEX_INITIALIZER;
  • static cl_kernel gKernel[KERNELNUMBER];
  • static size_t gMaxGroupSize[KERNELNUMBER];
  • #include "KernelWarp.h"
  • static void initKernel(opencl_context* context)
  • {
  • int error;
  • gKernel[IDCT_FLOAT] = opencl_compile_create_kernel(context, idct_kernel_clclh, "idct_float");
  • gKernel[YUV_RGB] = opencl_compile_create_kernel(context,yuv_rgb_clclh, "yuv_rgb");
  • LOGI("gInstance->device_id=0x%x",gInstance->device_id);
  • error = aclGetKernelWorkGroupInfo(gKernel[IDCT_FLOAT], gInstance->device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), gMaxGroupSize+IDCT_FLOAT, NULL);
  • LOGI("(gMaxGroupSize+IDCT_FLOAT)=%d",*(gMaxGroupSize+IDCT_FLOAT));
  • assert(CL_SUCCESS == error);
  • error = aclGetKernelWorkGroupInfo(gKernel[YUV_RGB], gInstance->device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), gMaxGroupSize+YUV_RGB, NULL);
  • assert(CL_SUCCESS == error);
  • }
  • opencl_context* jpeg_get_context()
  • {
  • if (NULL == gInstance)
  • {
  • pthread_mutex_lock(&gMutex);
  • if (NULL == gInstance)
  • {
  • gInstance = opencl_create_context(1);
  • initKernel(gInstance);
  • }
  • pthread_mutex_unlock(&gMutex);
  • }
  • return gInstance;
  • }
  • cl_kernel jpeg_get_kernel(KERNELNAME name)
  • {
  • assert(NULL!=gInstance);
  • return gKernel[name];
  • }
  • struct ComponentInfo
  • {
  • int max_x_sample;
  • int max_y_sample;
  • int YW;
  • int YH;
  • int UW;
  • int UH;
  • int UOffset;
  • int VW;
  • int VH;
  • int VOffset;
  • int blocksInMCU;
  • int MCU_Per_Row;
  • };
  • opencl_mem* rgbBuffer;
  • opencl_context* context;
  • opencl_mem* inputBuffer;
  • JBLOCK* mcu;
  • JBLOCK** MCU_buffer;
  • opencl_mem* yuvBuffer;
  • opencl_mem* tablebuffer;
  • opencl_mem* offsetbuffer;
  • int currentoffset = 0;
  • size_t totalBlock;
  • size_t totalMCUNumber;
  • int openclInit()
  • {
  • initFns();
  • TIME_START;
  • /*
  • if (cinfo->dct_method != JDCT_FLOAT || cinfo->comps_in_scan != 3 || DCTSIZE != 8)
  • {
  • return -1;
  • }*/
  • context = jpeg_get_context();
  • TIME_END;
  • return 0;
  • }
  • void openclDestroy()
  • {
  • opencl_destroy_mem(yuvBuffer);
  • opencl_destroy_mem(rgbBuffer);
  • opencl_destroy_mem(inputBuffer);
  • opencl_destroy_context(context);
  • }
  • int gloalValInit(j_decompress_ptr cinfo,GLuint glTextureId)
  • {
  • TIME_START;
  • totalMCUNumber = cinfo->MCU_rows_in_scan * cinfo->MCUs_per_row;
  • totalBlock = totalMCUNumber*cinfo->blocks_in_MCU;
  • for (int i=0;i<3; ++i)
  • {
  • assert(cinfo->cur_comp_info[i]->DCT_scaled_size == 8);
  • }
  • inputBuffer = opencl_create_mem(context, totalBlock * sizeof(JBLOCK));
  • TIME_END;
  • cl_int err;
  • LOGI("glTextureId =%d",glTextureId);
  • rgbBuffer = (opencl_mem *) malloc(sizeof(opencl_mem));
  • rgbBuffer->size = 4 * sizeof(unsigned char) * DCTSIZE2 * cinfo->MCU_rows_in_scan *
  • cinfo->MCUs_per_row * cinfo->max_h_samp_factor * cinfo->max_v_samp_factor;
  • rgbBuffer->queue = context->queue;
  • rgbBuffer->map = NULL;
  • rgbBuffer->base = aclCreateFromGLTexture(context->context, CL_MEM_READ_WRITE, GL_TEXTURE_2D,
  • 0, glTextureId, &err);
  • if (0 != err) {//CL_INVALID_CONTEXT
  • LOGE("clCreateFromGLTexture2D error,%p,err=%d", rgbBuffer->base, err);
  • return -1;
  • }
  • else {
  • LOGI("clCreateFromGLTexture success,rgbBuffer->base=%p", rgbBuffer->base);
  • }
  • yuvBuffer = opencl_create_mem(context, totalMCUNumber*cinfo->blocks_in_MCU * DCTSIZE2 * sizeof(unsigned char));
  • TIME_END;
  • return 0;
  • }
  • int jpeg_decode_by_opencl(j_decompress_ptr cinfo,GLuint glTextureId,int flag)
  • {
  • if(0==flag){
  • if(-1==gloalValInit(cinfo,glTextureId)){
  • return -1;
  • }
  • }
  • TIME_START;
  • opencl_sync_mem(inputBuffer, TOCPU);
  • TIME_END;
  • jzero_far(inputBuffer->map, totalMCUNumber*cinfo->blocks_in_MCU * sizeof(JBLOCK));
  • MCU_buffer = (JBLOCK**)malloc(sizeof(JBLOCK*)*cinfo->blocks_in_MCU);
  • mcu = (JBLOCK*)inputBuffer->map;
  • TIME_END;
  • LOGI("totalMCUNumber=%d",totalMCUNumber);
  • int i, blkn;
  • for (i = 0; i < totalMCUNumber; ++i)
  • {
  • for (blkn = 0; blkn < cinfo->blocks_in_MCU; ++blkn)
  • {
  • MCU_buffer[blkn] = mcu + cinfo->blocks_in_MCU*i + blkn;
  • }
  • if ( FALSE == (*cinfo->entropy->decode_mcu) (cinfo, MCU_buffer))
  • {
  • break;
  • }
  • }
  • free(MCU_buffer);
  • opencl_sync_mem(inputBuffer, TOGPU);
  • TIME_END;
  • /*Upload quantry table*/
  • offsetbuffer = opencl_create_mem(context, cinfo->blocks_in_MCU*sizeof(int));
  • tablebuffer = opencl_create_mem(context, cinfo->comps_in_scan*DCTSIZE2*sizeof(float));
  • opencl_sync_mem(tablebuffer, TOCPU);
  • opencl_sync_mem(offsetbuffer, TOCPU);
  • for (i = 0; i< cinfo->comps_in_scan; ++i)
  • {
  • int j;
  • jpeg_component_info* info = cinfo->cur_comp_info[i];
  • float* table = (float*)info->dct_table;//FIXME
  • float* table_in_buffer = (float*)(tablebuffer->map) + DCTSIZE2*i;
  • int* offset = (int*)offsetbuffer->map;
  • memcpy(table_in_buffer, table, sizeof(float)*DCTSIZE2);
  • for (j=0; j<info->h_samp_factor*info->v_samp_factor; ++j)
  • {
  • offset[currentoffset + j] = i;
  • }
  • currentoffset+=info->h_samp_factor*info->v_samp_factor;
  • }
  • TIME_END;
  • opencl_sync_mem(tablebuffer, TOGPU);
  • opencl_sync_mem(offsetbuffer, TOGPU);
  • /*idct*/
  • {
  • int error;
  • size_t alignblocks;
  • cl_kernel kernel = jpeg_get_kernel(IDCT_FLOAT);
  • opencl_set_mem(kernel, inputBuffer, 0);
  • opencl_set_mem(kernel, yuvBuffer, 1);
  • opencl_set_mem(kernel, tablebuffer, 2);
  • opencl_set_mem(kernel, offsetbuffer, 3);
  • error = aclSetKernelArg(kernel, 4, sizeof(int), &cinfo->blocks_in_MCU);
  • assert(CL_SUCCESS == error);
  • error = aclSetKernelArg(kernel, 5, sizeof(size_t), &totalBlock);
  • assert(CL_SUCCESS == error);
  • alignblocks = (totalBlock + gMaxGroupSize[IDCT_FLOAT]-1)/gMaxGroupSize[IDCT_FLOAT]*gMaxGroupSize[IDCT_FLOAT];
  • error = aclEnqueueNDRangeKernel(context->queue, kernel, 1, NULL, &alignblocks, gMaxGroupSize+IDCT_FLOAT, 0, NULL, NULL);
  • assert(CL_SUCCESS == error);
  • aclFinish(context->queue);
  • }
  • opencl_destroy_mem(tablebuffer);
  • opencl_destroy_mem(offsetbuffer);
  • TIME_END;
  • /*TODO Sample YUV to RGB*/
  • {
  • int stride = cinfo->MCUs_per_row * cinfo->max_h_samp_factor * DCTSIZE;
  • struct ComponentInfo info;
  • size_t global[2] = {cinfo->MCUs_per_row*cinfo->max_h_samp_factor, cinfo->MCU_rows_in_scan*cinfo->max_v_samp_factor*DCTSIZE};
  • cl_kernel kernel = jpeg_get_kernel(YUV_RGB);
  • opencl_sync_mem(rgbBuffer, TOGPU);
  • assert(NULL!=rgbBuffer);
  • info.YW = cinfo->cur_comp_info[0]->h_samp_factor;
  • info.YH = cinfo->cur_comp_info[0]->v_samp_factor;
  • info.UW = cinfo->cur_comp_info[1]->h_samp_factor;
  • info.UH = cinfo->cur_comp_info[1]->v_samp_factor;
  • info.VW = cinfo->cur_comp_info[2]->h_samp_factor;
  • info.VH = cinfo->cur_comp_info[2]->v_samp_factor;
  • info.UOffset = info.YW*info.YH;
  • info.VOffset = info.UOffset + info.UW*info.UH;
  • info.blocksInMCU = cinfo->blocks_in_MCU;
  • info.MCU_Per_Row = cinfo->MCUs_per_row;
  • info.max_x_sample = cinfo->max_h_samp_factor;
  • info.max_y_sample = cinfo->max_v_samp_factor;
  • // LOGI("%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d",info.YW,info.YH,info.UW,info.UH,info.VW,info.VH,info.UOffset,info.VOffset,info.blocksInMCU,info.MCU_Per_Row,info.max_x_sample,info.max_y_sample);
  • // LOGI("stride=%d",stride);
  • {
  • int error ;
  • opencl_set_mem(kernel, yuvBuffer, 0);
  • opencl_set_mem(kernel, rgbBuffer, 1);
  • aclSetKernelArg(kernel, 3, sizeof(int), &info.max_x_sample);
  • aclSetKernelArg(kernel, 4, sizeof(int), &info.max_y_sample);
  • aclSetKernelArg(kernel, 5, sizeof(int), &info.YW);
  • aclSetKernelArg(kernel, 6, sizeof(int), &info.YH);
  • aclSetKernelArg(kernel, 7, sizeof(int), &info.UW);
  • aclSetKernelArg(kernel, 8, sizeof(int), &info.UH);
  • aclSetKernelArg(kernel, 9, sizeof(int), &info.UOffset);
  • aclSetKernelArg(kernel, 10, sizeof(int), &info.VW);
  • aclSetKernelArg(kernel, 11, sizeof(int), &info.VH);
  • aclSetKernelArg(kernel, 12, sizeof(int), &info.VOffset);
  • aclSetKernelArg(kernel, 13, sizeof(int), &info.blocksInMCU);
  • aclSetKernelArg(kernel, 14, sizeof(int), &info.MCU_Per_Row);
  • assert(CL_SUCCESS == error);
  • error = aclSetKernelArg(kernel, 2, sizeof(int), &stride);
  • assert(CL_SUCCESS == error);
  • error = aclEnqueueNDRangeKernel(context->queue, kernel, 2, NULL, global, NULL, 0, NULL, NULL);
  • assert(CL_SUCCESS == error);
  • aclFinish(context->queue);
  • }
  • /*Copy rgbbuffer to output_buf*/
  • TIME_END;
  • }
  • //opencl_destroy_mem(yuvBuffer);
  • //opencl_destroy_mem(rgbBuffer);
  • LOGI("jpeg_decode_by_opencl END");
  • return 1;
  • }

KernelWrap.h (shading程序)

  • const char* idct_kernel_clclh =
  • "#define DCTSIZE 8\n"
  • "#define DCTSIZE2 64\n"
  • "#define LOADSRC(i, src) convert_float8(vload8(i, src))*vload8(i, table)\n"
  • "__kernel void idct_float(__global short* input, __global unsigned char* output, __global const float* dequantilize_table, __global const int* order, int blocks_per_mcu, uint totalblocks)\n"
  • "{\n"
  • " int blkn = get_global_id(0);\n"
  • " if (blkn < totalblocks)\n"
  • " {\n"
  • " __global short* src = input + DCTSIZE2*blkn;\n"
  • " __global unsigned char* outptr = output + DCTSIZE2*blkn;\n"
  • " __global const float* table = dequantilize_table + order[blkn % blocks_per_mcu]*DCTSIZE2;\n"
  • " float8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;\n"
  • " float8 w0, w1, w2, w3, w4, w5, w6, w7;\n"
  • " float8 tmp10, tmp11, tmp12, tmp13;\n"
  • " float8 z5, z10, z11, z12, z13;\n"
  • " tmp0 = LOADSRC(0, src);\n"
  • " tmp1 = LOADSRC(2, src);\n"
  • " tmp2 = LOADSRC(4, src);\n"
  • " tmp3 = LOADSRC(6, src);\n"
  • " tmp10 = tmp0 + tmp2; /* phase 3 */\n"
  • " tmp11 = tmp0 - tmp2;\n"
  • " \n"
  • " tmp13 = tmp1 + tmp3; /* phases 5-3 */\n"
  • " tmp12 = (tmp1 - tmp3) * (float8)1.414213562 - tmp13; /* 2*c4 */\n"
  • " \n"
  • " tmp0 = tmp10 + tmp13; /* phase 2 */\n"
  • " tmp3 = tmp10 - tmp13;\n"
  • " tmp1 = tmp11 + tmp12;\n"
  • " tmp2 = tmp11 - tmp12;\n"
  • " \n"
  • " tmp4 = LOADSRC(1, src);\n"
  • " tmp5 = LOADSRC(3, src);\n"
  • " tmp6 = LOADSRC(5, src);\n"
  • " tmp7 = LOADSRC(7, src);\n"
  • " \n"
  • " z13 = tmp6 + tmp5; /* phase 6 */\n"
  • " z10 = tmp6 - tmp5;\n"
  • " z11 = tmp4 + tmp7;\n"
  • " z12 = tmp4 - tmp7;\n"
  • " \n"
  • " tmp7 = z11 + z13; /* phase 5 */\n"
  • " tmp11 = (z11 - z13) * (float8)(1.414213562); /* 2*c4 */\n"
  • " \n"
  • " z5 = (z10 + z12) * (float8)(1.847759065); /* 2*c2 */\n"
  • " tmp10 = (float8)(1.082392200) * z12 - z5; /* 2*(c2-c6) */\n"
  • " tmp12 = (float8)(-2.613125930) * z10 + z5; /* -2*(c2+c6) */\n"
  • " \n"
  • " tmp6 = tmp12 - tmp7; /* phase 2 */\n"
  • " tmp5 = tmp11 - tmp6;\n"
  • " tmp4 = tmp10 + tmp5;\n"
  • " \n"
  • " tmp0 = tmp0 + tmp7;\n"
  • " tmp7 = tmp0 - (float8)(2)*tmp7;\n"
  • " tmp1 = tmp1 + tmp6;\n"
  • " tmp6 = tmp1 - (float8)(2)*tmp6;\n"
  • " tmp2 = tmp2 + tmp5;\n"
  • " tmp5 = tmp2 - (float8)(2)*tmp5;\n"
  • " tmp4 = tmp3 + tmp4;\n"
  • " tmp3 = (float8)(2)*tmp3 - tmp4;\n"
  • " /*Cross*/\n"
  • "#define TRANS(w, i) w##i = (float8)(tmp0.s##i, tmp1.s##i, tmp2.s##i, tmp3.s##i, tmp4.s##i, tmp5.s##i, tmp6.s##i, tmp7.s##i)\n"
  • " TRANS(w, 0);\n"
  • " TRANS(w, 1);\n"
  • " TRANS(w, 2);\n"
  • " TRANS(w, 3);\n"
  • " TRANS(w, 4);\n"
  • " TRANS(w, 5);\n"
  • " TRANS(w, 6);\n"
  • " TRANS(w, 7);\n"
  • "#undef TRANS\n"
  • " \n"
  • " tmp10 = w0 + w4;\n"
  • " tmp11 = w0 - w4;\n"
  • " \n"
  • " tmp13 = w2 + w6;\n"
  • " tmp12 = (w2 - w6) * (float8)(1.414213562) - tmp13;\n"
  • " \n"
  • " tmp0 = tmp10 + tmp13;\n"
  • " tmp3 = tmp10 - tmp13;\n"
  • " tmp1 = tmp11 + tmp12;\n"
  • " tmp2 = tmp11 - tmp12;\n"
  • " \n"
  • " z13 = w5 + w3;\n"
  • " z10 = w5 - w3;\n"
  • " z11 = w1 + w7;\n"
  • " z12 = w1 - w7;\n"
  • " \n"
  • " tmp7 = z11 + z13;\n"
  • " tmp11 = (z11 - z13) * (float8)(1.414213562);\n"
  • " \n"
  • " z5 = (z10 + z12) * (float8)(1.847759065); /* 2*c2 */\n"
  • " tmp10 = (float8)(1.082392200) * z12 - z5; /* 2*(c2-c6) */\n"
  • " tmp12 = (float8)(-2.613125930) * z10 + z5; /* -2*(c2+c6) */\n"
  • " \n"
  • " tmp6 = tmp12 - tmp7;\n"
  • " tmp5 = tmp11 - tmp6;\n"
  • " tmp4 = tmp10 + tmp5;\n"
  • " \n"
  • " tmp0 = tmp0 + tmp7;\n"
  • " tmp7 = tmp0 - (float8)(2)*tmp7;\n"
  • " tmp1 = tmp1 + tmp6;\n"
  • " tmp6 = tmp1 - (float8)(2)*tmp6;\n"
  • " tmp2 = tmp2 + tmp5;\n"
  • " tmp5 = tmp2 - (float8)(2)*tmp5;\n"
  • " tmp4 = tmp3 + tmp4;\n"
  • " tmp3 = (float8)(2)*tmp3 - tmp4;\n"
  • " /*Cross*/\n"
  • "#define TRANS(w, i) w##i = (float8)(tmp0.s##i, tmp1.s##i, tmp2.s##i, tmp3.s##i, tmp4.s##i, tmp5.s##i, tmp6.s##i, tmp7.s##i)\n"
  • " TRANS(w, 0);\n"
  • " TRANS(w, 1);\n"
  • " TRANS(w, 2);\n"
  • " TRANS(w, 3);\n"
  • " TRANS(w, 4);\n"
  • " TRANS(w, 5);\n"
  • " TRANS(w, 6);\n"
  • " TRANS(w, 7);\n"
  • "#undef TRANS\n"
  • " /* Final output stage: scale down by a factor of 8 and range-limit */\n"
  • "#define RESULT(t) convert_uchar8(clamp((t)/(float8)(8)+(float8)(128), (float8)(0), (float8)(255)))\n"
  • " vstore8(RESULT(w0), 0, outptr);\n"
  • " vstore8(RESULT(w7), 7, outptr);\n"
  • " vstore8(RESULT(w1), 1, outptr);\n"
  • " vstore8(RESULT(w6), 6, outptr);\n"
  • " vstore8(RESULT(w2), 2, outptr);\n"
  • " vstore8(RESULT(w5), 5, outptr);\n"
  • " vstore8(RESULT(w4), 4, outptr);\n"
  • " vstore8(RESULT(w3), 3, outptr);\n"
  • "#undef RESULT\n"
  • " }\n"
  • "}\n"
  • ;
  • const char* yuv_rgb_clclh =
  • "#define DCTSIZE2 64\n"
  • "#define DCTSIZE 8\n"
  • "struct ComponentInfo\n"
  • "{\n"
  • " int max_x_sample;\n"
  • " int max_y_sample;\n"
  • " int YW;\n"
  • " int YH;\n"
  • " int UW;\n"
  • " int UH;\n"
  • " int UOffset;\n"
  • " int VW;\n"
  • " int VH;\n"
  • " int VOffset;\n"
  • " int blocksInMCU;\n"
  • " int MCU_Per_Row;\n"
  • "};\n"
  • "__kernel void yuv_rgb(__global unsigned char* yuvbuffer,__write_only image2d_t rgba, int output_stride,int max_x_sample,int max_y_sample,"
  • "int YW,int YH,int UW,int UH,int UOffset,int VW,int VH,int VOffset,int blocksInMCU,int MCU_Per_Row)\n"
  • "{\n"
  • " struct ComponentInfo info;\n"
  • "info.max_x_sample=max_x_sample;\n"
  • "info.max_y_sample=max_y_sample;\n"
  • "info.YW=YW;\n"
  • "info.YH=YH;\n"
  • "info.UW=UW;\n"
  • "info.UH=UH;\n"
  • "info.UOffset=UOffset;\n"
  • "info.VW=VW;\n"
  • "info.VH=VH;\n"
  • "info.VOffset=VOffset;\n"
  • "info.blocksInMCU=blocksInMCU;\n"
  • "info.MCU_Per_Row=MCU_Per_Row;\n"
  • " int x = get_global_id(0);\n"
  • " int y_origin = get_global_id(1);\n"
  • " int yoffset = y_origin % DCTSIZE;\n"
  • " int y = y_origin/DCTSIZE;\n"
  • " int mcux = x/info.max_x_sample;\n"
  • " int mcuy = y/info.max_y_sample;\n"
  • " __global unsigned char* basic = yuvbuffer + DCTSIZE2*info.blocksInMCU*(info.MCU_Per_Row*mcuy + mcux);\n"
  • " __global unsigned char* Y = basic + DCTSIZE2*((x%info.YW) + 2*(y%info.YH)) + DCTSIZE*yoffset;\n"
  • " __global unsigned char* U = basic + DCTSIZE2*((x%info.UW) + 2*(y%info.UH)+info.UOffset) + DCTSIZE*yoffset;\n"
  • " __global unsigned char* V = basic + DCTSIZE2*((x%info.VW) + 2*(y%info.VH)+info.VOffset) + DCTSIZE*yoffset;\n"
  • " float8 yy = convert_float8(vload8(0, Y));\n"
  • " float8 uu = convert_float8(vload8(0, U)) - (float8)(128);\n"
  • " float8 vv = convert_float8(vload8(0, V)) - (float8)(128);\n"
  • " float8 r, g, b;\n"
  • "#define RESULT(x) (clamp(x, (float8)(0), (float8)(255)))\n"
  • " r = RESULT((yy + (float8)(1.40200)*vv)/(float8)255.0);\n"
  • " g = RESULT((yy - (float8)(0.34414)*uu - (float8)(0.71414)*vv)/(float8)255.0);\n"
  • " b = RESULT((yy + (float8)(1.77200)*uu)/(float8)255.0);\n"
  • "#undef RESULT\n"
  • " int2 coord= (int2)(x*DCTSIZE, y_origin);\n"
  • " float4 color=(float4)(r.s0, g.s0, b.s0, 0);\n"
  • " write_imagef(rgba,coord,color);\n"
  • " coord= (int2)(x*DCTSIZE+1, y_origin);\n"
  • " color=(float4)(r.s1, g.s1, b.s1, 0);\n"
  • " write_imagef(rgba,coord,color);\n"
  • " coord= (int2)(x*DCTSIZE+2, y_origin);\n"
  • " color=(float4)(r.s2, g.s2, b.s2, 0);\n"
  • " write_imagef(rgba,coord,color);\n"
  • " coord= (int2)(x*DCTSIZE+3, y_origin);\n"
  • " color=(float4)(r.s3, g.s3, b.s3, 0);\n"
  • " write_imagef(rgba,coord,color);\n"
  • " coord= (int2)(x*DCTSIZE+4, y_origin);\n"
  • " color=(float4)(r.s4, g.s4, b.s4, 0);\n"
  • " write_imagef(rgba,coord,color);\n"
  • " coord= (int2)(x*DCTSIZE+5, y_origin);\n"
  • " color=(float4)(r.s5, g.s5, b.s5, 0);\n"
  • " write_imagef(rgba,coord,color);\n"
  • " coord= (int2)(x*DCTSIZE+6, y_origin);\n"
  • " color=(float4)(r.s6, g.s6, b.s6,0);\n"
  • " write_imagef(rgba,coord,color);\n"
  • " coord= (int2)(x*DCTSIZE+7, y_origin);\n"
  • " color=(float4)(r.s7, g.s7, b.s7, 0);\n"
  • " write_imagef(rgba,coord,color);\n"
  • "}\n"
  • ;
方便获取更多学习、工作、生活信息请关注本站微信公众号城东书院 微信服务号城东书院 微信订阅号
推荐内容
相关内容
栏目更新
栏目热门
本栏推荐