#include "benchmark/benchmark.h"
#include "arm_neon.h"
/*
 * Scalar reference implementation: split the interleaved stream "rgb"
 * (R0 G0 B0 R1 G1 B1 ...) into the three planar outputs "r", "g" and "b".
 * "len_color" is the number of pixels, so 3 * len_color bytes are read from
 * "rgb" and len_color bytes are written to each output.
 */
void rgb_deinterleave_c(uint8_t *r, uint8_t *g, uint8_t *b, uint8_t *rgb, int len_color) {
    /* Walk the interleaved input one pixel (three bytes) at a time. */
    uint8_t *pixel = rgb;
    for (int px = 0; px < len_color; ++px, pixel += 3) {
        r[px] = pixel[0];
        g[px] = pixel[1];
        b[px] = pixel[2];
    }
}
/*
 * NEON implementation: split the interleaved stream "rgb"
 * (R0 G0 B0 R1 G1 B1 ...) into the planar outputs "r", "g" and "b".
 * "len_color" is the number of pixels, so 3 * len_color bytes are read from
 * "rgb" and len_color bytes are written to each output.
 *
 * Pixels are processed 16 at a time with vld3q_u8; any remainder
 * (len_color % 16) is handled by a scalar loop so that every pixel is
 * written even when len_color is not a multiple of 16.
 */
void rgb_deinterleave_neon(uint8_t *r, uint8_t *g, uint8_t *b, uint8_t *rgb, int len_color) {
    if (len_color <= 0) {
        return;
    }

    const int lanes = 16;
    /* Index of the first pixel that does not fit in a full 16-lane vector. */
    const int vec_end = len_color - (len_color % lanes);

    int i = 0;
    for (; i < vec_end; i += lanes) {
        /* vld3q_u8 loads 48 bytes and de-interleaves them into three 16-byte registers. */
        const uint8x16x3_t px = vld3q_u8(rgb + 3 * i);
        vst1q_u8(r + i, px.val[0]);
        vst1q_u8(g + i, px.val[1]);
        vst1q_u8(b + i, px.val[2]);
    }

    /* Scalar tail: at most 15 leftover pixels. */
    for (; i < len_color; ++i) {
        r[i] = rgb[3 * i];
        g[i] = rgb[3 * i + 1];
        b[i] = rgb[3 * i + 2];
    }
}
// Number of pixels de-interleaved per benchmark iteration.
constexpr int LEN_COLOR = 1000;

// Planar output buffers, one byte per pixel each.
uint8_t r[LEN_COLOR];
uint8_t g[LEN_COLOR];
uint8_t b[LEN_COLOR];
// Interleaved input buffer, three bytes per pixel.
uint8_t rgb[LEN_COLOR * 3];
// Benchmark for the scalar implementation: each timed iteration
// de-interleaves LEN_COLOR pixels from the global "rgb" buffer into the
// global "r", "g" and "b" buffers.
void testC(benchmark::State& state){
    for (auto _ : state) {
        rgb_deinterleave_c(r, g, b, rgb, LEN_COLOR);
        // Compiler barrier: forces the stores above to be materialized on
        // every iteration so the work cannot be elided or hoisted out of
        // the timed loop.
        benchmark::ClobberMemory();
    }
}
BENCHMARK(testC);
// Benchmark for the NEON implementation: each timed iteration
// de-interleaves LEN_COLOR pixels from the global "rgb" buffer into the
// global "r", "g" and "b" buffers.
void testNeon(benchmark::State& state){
    for (auto _ : state) {
        rgb_deinterleave_neon(r, g, b, rgb, LEN_COLOR);
        // Compiler barrier: forces the stores above to be materialized on
        // every iteration so the work cannot be elided or hoisted out of
        // the timed loop.
        benchmark::ClobberMemory();
    }
}
BENCHMARK(testNeon);
// Google Benchmark macro that supplies main() and runs the benchmarks
// registered above with BENCHMARK().
BENCHMARK_MAIN();
/*
Run on (8 X 24.1206 MHz CPU s)
CPU Caches:
L1 Data 64 KiB
L1 Instruction 128 KiB
L2 Unified 4096 KiB (x8)
Load Average: 2.51, 2.79, 2.99
-----------------------------------------------------
Benchmark Time CPU Iterations
-----------------------------------------------------
testC 1928 ns 1927 ns 350642
testNeon 396 ns 395 ns 1768137
*/