一个ARM-NEON的demo

之前和炮姐聊天的时候，打开了新世界的大门。手边有一台 MacBook Pro M1，就用这台设备写了一个简单的 demo。Apple M1 芯片支持 ARMv8-A 指令集，同时支持 NEON 扩展指令集。测试数据基于 Google Benchmark。

RGB deinterleaving

#include "benchmark/benchmark.h"
#include "arm_neon.h"

void rgb_deinterleave_c(uint8_t *r, uint8_t *g, uint8_t *b, uint8_t *rgb, int len_color) {
    // Scalar reference implementation: split the packed R,G,B triplets in
    // "rgb" into the three planar outputs "r", "g" and "b".
    // "rgb" must hold 3 * len_color bytes; each output holds len_color bytes.
    const uint8_t *pixel = rgb;  // always points at the R byte of the current triplet
    for (int px = 0; px < len_color; ++px, pixel += 3) {
        r[px] = pixel[0];
        g[px] = pixel[1];
        b[px] = pixel[2];
    }
}
void rgb_deinterleave_neon(uint8_t *r, uint8_t *g, uint8_t *b, uint8_t *rgb, int len_color) {
    /*
     * Take the elements of "rgb" and store the individual colors "r", "g", and "b".
     *
     * r, g, b:   output planes, each with room for len_color bytes.
     * rgb:       packed input of 3 * len_color bytes laid out R,G,B,R,G,B,...
     * len_color: number of pixels; it does not have to be a multiple of 16.
     */
    constexpr int kLanes = 16;  // pixels handled per vld3q_u8 / vst1q_u8 round
    const int vec_end = (len_color / kLanes) * kLanes;

    int i = 0;
    // Vector part: vld3q_u8 reads 48 interleaved bytes and de-interleaves them
    // into three 16-byte registers (val[0] = R, val[1] = G, val[2] = B).
    for (; i < vec_end; i += kLanes) {
        const uint8x16x3_t intlv_rgb = vld3q_u8(rgb + 3 * i);
        vst1q_u8(r + i, intlv_rgb.val[0]);
        vst1q_u8(g + i, intlv_rgb.val[1]);
        vst1q_u8(b + i, intlv_rgb.val[2]);
    }
    // Scalar tail: the remaining len_color % 16 pixels do not fill a whole
    // vector, so copy them one by one instead of reading past the input.
    for (; i < len_color; ++i) {
        r[i] = rgb[3 * i];
        g[i] = rgb[3 * i + 1];
        b[i] = rgb[3 * i + 2];
    }
}
// Number of pixels processed per benchmark iteration.
#define LEN_COLOR 1000
// Benchmark buffers: planar outputs r/g/b (LEN_COLOR bytes each) and the packed
// input rgb (3 bytes per pixel). As globals they are zero-initialized.
uint8_t r[LEN_COLOR], g[LEN_COLOR], b[LEN_COLOR], rgb[LEN_COLOR * 3];
// Benchmark: scalar de-interleave of LEN_COLOR pixels per timed iteration.
void testC(benchmark::State& state){
    for(auto _ : state){
        rgb_deinterleave_c(r, g, b, rgb, LEN_COLOR);
        // Force all pending writes to the output planes to be performed inside
        // the timed loop so the optimizer cannot sink or drop them.
        benchmark::ClobberMemory();
    }
}

BENCHMARK(testC);
// Benchmark: NEON de-interleave of LEN_COLOR pixels per timed iteration.
void testNeon(benchmark::State& state){
    for(auto _ : state){
        rgb_deinterleave_neon(r, g, b, rgb, LEN_COLOR);
        // Force all pending writes to the output planes to be performed inside
        // the timed loop so the optimizer cannot sink or drop them.
        benchmark::ClobberMemory();
    }
}

BENCHMARK(testNeon);
BENCHMARK_MAIN();

/*
Run on (8 X 24.1206 MHz CPU s)
CPU Caches:
  L1 Data 64 KiB
  L1 Instruction 128 KiB
  L2 Unified 4096 KiB (x8)
Load Average: 2.51, 2.79, 2.99
-----------------------------------------------------
Benchmark           Time             CPU   Iterations
-----------------------------------------------------
testC            1928 ns         1927 ns       350642
testNeon          396 ns          395 ns      1768137
*/

Accumulate Array

#include "benchmark/benchmark.h"
#include "arm_neon.h"

// Kept only so that any external reference to this global still links;
// add_result_neon neither reads nor writes it.
float brr[4] = {0, 0, 0, 0};
/*
 * Sum the first "len" floats of "arr" using 4-lane NEON additions.
 *
 * arr: input array with at least len elements.
 * len: element count; it does not have to be a multiple of 4.
 * Returns the sum. Elements are accumulated per lane first and the four lane
 * totals are then added together, so rounding can differ from a strictly
 * left-to-right scalar sum.
 */
float add_result_neon(float* arr, int len){
    constexpr int kLanes = 4;  // floats per float32x4_t
    const int vec_end = (len / kLanes) * kLanes;

    // Start every call from a zeroed local accumulator; seeding it from shared
    // state would leak the previous call's partial sums into this result.
    float32x4_t acc = vdupq_n_f32(0.0f);
    int i = 0;
    for(; i < vec_end; i += kLanes){
        acc = vaddq_f32(acc, vld1q_f32(arr + i));
    }

    // Horizontal reduction: spill the four lane totals and add them up.
    float lanes[kLanes];
    vst1q_f32(lanes, acc);
    float res = 0;
    for(float lane : lanes){
        res += lane;
    }
    // Scalar tail for the len % 4 elements that do not fill a whole vector.
    for(; i < len; ++i){
        res += arr[i];
    }
    return res;
}
float add_result_c(float* arr, int len){
    // Scalar reference: add the first "len" floats of "arr" front to back and
    // return the total (0 when len is not positive).
    float total = 0;
    const float* cursor = arr;
    for(int remaining = len; remaining > 0; --remaining){
        total += *cursor++;
    }
    return total;
}
// Number of floats summed per benchmark iteration.
#define LEN 1000
// Benchmark input; as a global it is zero-initialized.
float arr[LEN];
// Benchmark: scalar sum of LEN floats per timed iteration.
void testC(benchmark::State& state){
    for(auto _: state){
        float sum = add_result_c(arr, LEN);
        // The result is otherwise unused; mark it as observed so an optimizing
        // build cannot delete the whole call as dead code.
        benchmark::DoNotOptimize(sum);
    }
}
// Benchmark: NEON sum of LEN floats per timed iteration.
void testNeon(benchmark::State& state){
    for(auto _: state){
        float sum = add_result_neon(arr, LEN);
        // The result is otherwise unused; mark it as observed so an optimizing
        // build cannot delete the whole call as dead code.
        benchmark::DoNotOptimize(sum);
    }
}

// Register both variants with Google Benchmark.
BENCHMARK(testC);
BENCHMARK(testNeon);
// Expands to the program's main(), which runs the registered benchmarks.
BENCHMARK_MAIN();

/*
Run on (8 X 24.1212 MHz CPU s)
CPU Caches:
  L1 Data 64 KiB
  L1 Instruction 128 KiB
  L2 Unified 4096 KiB (x8)
Load Average: 4.72, 4.17, 3.73
-----------------------------------------------------
Benchmark           Time             CPU   Iterations
-----------------------------------------------------
testC            3428 ns         3422 ns       203604
testNeon          873 ns          872 ns       79435
*/

一个ARM-NEON的demo
https://www.dianhsu.com/2022/08/17/neon-test/
Author
Dian Hsu
Posted on
August 17, 2022
Licensed under