访存加速-Speed-up of Memory Access Intensive Program

【参考网址】
关于采用PLD指令进行内存预取减少内存操作指令的等待时间:
https://www.jianshu.com/p/7b3bfc3aed12
关于L1,L2,L3的缓存介绍:
https://www.cnblogs.com/arnoldlu/p/7883663.html
关于乱序处理和顺序处理的解释:(本质上不存在乱序,只是指令之间的依赖关系)
https://www.sohu.com/a/127028459_505803
实际测试中,将预取与基于NEON的v_load(OpenCV universal intrinsics 的向量加载接口)配合使用,似乎并没有起到任何加速作用,反而有些变慢了。
代码如下:

static inline void prefetch_range(uint8_t *addr, size_t len) { uint8_t *cp; uint8_t *end = addr + len; for (cp = addr; cp < end; cp += 1) __builtin_prefetch(cp, 0, 3); } static inline void prestore_range(uint8_t *addr, size_t len){ uint8_t *cp; uint8_t *end = addr + len; for (cp = addr; cp < end; cp += 1) __builtin_prefetch(cp, 1, 3); } void GetPixelOrder_CPU(cv::Mat & order, const cv::Mat & grey){ uint8_t * data_order = order.data; uint8_t * data_grey = grey.data; if(order.type() != CV_8UC1 || grey.type() != CV_8UC1 || order.size() != grey.size()){ CAP_LOGE("[GetPixelOrder] invalid inputs"); exit(-1); } int h = grey.size().height; int w = grey.size().width; int offset; int idx; uint8_t buf[8]; // window is a 4 x 2 ( width x height ) rectangle uint8_t center; // center is in fact the left corner pixel in window for(int i=0; i<h-1; ++i){ offset = i * w; idx = 6; buf[0] = data_grey[offset]; buf[1] = data_grey[offset + w]; buf[2] = data_grey[offset + 1]; buf[3] = data_grey[offset + w + 1]; buf[4] = data_grey[offset + 2]; buf[5] = data_grey[offset + w + 2]; for(int j=0; j<w-3; ++j){ // update the outdated pixels with new ones coming in by the right side buf[idx] = data_grey[offset + 3]; buf[idx + 1] = data_grey[offset + w + 3]; // update window position ++offset; // update buffer state: position of outdating pixels idx = (idx + 2) % 8; // the center of window is always the up-left one center = buf[idx]; // calculate the order for center point within current window uint8_t counter = 0; for(int k=1; k<8; ++k) counter += (buf[(idx + k)%8] > center); data_order[offset] = counter << 5; } } } void GetPixelOrder_NEON(cv::Mat & order, const cv::Mat & grey){ uint8_t * data_order = order.data; uint8_t * data_grey = grey.data; if(order.type() != CV_8UC1 || grey.type() != CV_8UC1 || order.size() != grey.size()){ CAP_LOGE("[GetPixelOrder] invalid inputs"); exit(-1); } int h = grey.size().height; int w = grey.size().width; int offset; cv::v_uint8x16 v_b_0, v_b_1, v_b_2, v_b_3, 
v_b_4, v_b_5, v_b_6, v_b_7; // 4(w) x 2(h) window cv::v_uint8x16 v_counter; cv::v_uint8x16 v_flag; int y_end = h - 1; int x_end = w - 15; for(int i=0; i<y_end; ++i){ offset = i * w; for(int j=0; j<x_end; j+=16){ // memory hint instructions: PLD Commands to enable data prefetching //prefetch_range(data_grey + offset, 24); //prefetch_range(data_grey + offset + w, 24); //prestore_range(data_order + offset, 16); // update window v_b_0 = cv::v_load(data_grey + offset); v_b_2 = cv::v_load(data_grey + offset + 1); v_b_4 = cv::v_load(data_grey + offset + 2); v_b_6 = cv::v_load(data_grey + offset + 3); v_b_1 = cv::v_load(data_grey + offset + w); v_b_3 = cv::v_load(data_grey + offset + w + 1); v_b_5 = cv::v_load(data_grey + offset + w + 2); v_b_7 = cv::v_load(data_grey + offset + w + 3); // calculate the order for center point within current window v_counter = cv::v_setall_u8(0); v_flag = v_b_0 < v_b_1; v_flag = v_flag >> 7; v_counter = v_counter + v_flag; v_flag = v_b_0 < v_b_2; v_flag = v_flag >> 7; v_counter = v_counter + v_flag; v_flag = v_b_0 < v_b_3; v_flag = v_flag >> 7; v_counter = v_counter + v_flag; v_flag = v_b_0 < v_b_4; v_flag = v_flag >> 7; v_counter = v_counter + v_flag; v_flag = v_b_0 < v_b_5; v_flag = v_flag >> 7; v_counter = v_counter + v_flag; v_flag = v_b_0 < v_b_6; v_flag = v_flag >> 7; v_counter = v_counter + v_flag; v_flag = v_b_0 < v_b_7; v_flag = v_flag >> 7; v_counter = v_counter + v_flag; v_counter = v_counter << 5; cv::v_store(data_order + offset, v_counter); // update window position offset += 16; } } }
测试结果:CPU版本:17-26ms(手机运行不稳定);NEON版本:1-2ms(绝大部分时间是1ms);NEON+PLD/PST(预取/预存):3-7ms(不稳定)。将预取prefetch_range以及prestore_range加上之后,耗时稍微稳定了一些,但是速度变慢了。因此还需要认真思考预取的使用策略,才能使其真正生效。
posted @ 2019-12-10 16:51  xchk138  阅读(273)  评论(0)    收藏  举报