HarmonyOS系统上neon指令集与c的运行速度对比

​测试HarmonyOS系统上neon指令集的运行速度,与c程序的运行速度做对比

HarmonyOS
2024-05-23 21:52:22
浏览
收藏 0
回答 1
待解决
回答 1
按赞同
/
按时间
kersin

使用的OS功能相关的核心API

#include <arm_neon.h>

核心代码解释

在build-profile.json5的"externalNativeOptions"中配置如下,然后在需要的地方引入

#include <arm_neon.h>就可以使用neon了。

// Native (C/C++) build settings inside build-profile.json5.
"externalNativeOptions": { 
  // CMake script that describes the native library.
  "path": "./src/main/cpp/CMakeLists.txt", 
  // No extra arguments are passed to CMake.
  "arguments": "", 
  // Only the 64-bit ARM ABI is built.
  "abiFilters": ["arm64-v8a"], 
  // NOTE(review): -mfloat-abi is documented as a 32-bit ARM option; confirm it is
  // needed (or even accepted) for the arm64-v8a ABI selected above.
  "cppFlags": "-mfloat-abi=hard", 
}

neon实例的实现

#include "helloneon-intrinsics.h" 
#include <arm_neon.h> 
 
/*
 * This source file relies on <arm_neon.h>, so it must be built for a target
 * with NEON (Advanced SIMD) enabled. The project's build profile selects the
 * arm64-v8a ABI; a 32-bit ARM build would need NEON switched on explicitly.
 */

/**
 * FIR filter over 16-bit samples, vectorised with NEON intrinsics.
 *
 * For every nn in [0, width) the function accumulates
 *     sum = Σ kernel[mm] * input[nn - kernelSize / 2 + mm],  mm in [0, kernelSize)
 * in 32-bit precision and stores (sum + 0x8000) >> 16 in output[nn], i.e. the
 * sum scaled down by 2^16 after adding half of that divisor.
 *
 * @param output     receives `width` filtered samples.
 * @param input      sample pointer; the caller must guarantee that the indices
 *                   -kernelSize / 2 .. width - 1 - kernelSize / 2 + kernelSize - 1
 *                   are readable, because the filter window starts before `input`.
 * @param kernel     `kernelSize` filter taps.
 * @param width      number of output samples to produce.
 * @param kernelSize number of taps; a non-positive value is treated as an empty
 *                   kernel, so every output sample becomes 0.
 *
 * When the compiler does not advertise NEON (__ARM_NEON is undefined) the same
 * result is computed by a plain scalar loop, which doubles as the reference
 * implementation for comparison purposes.
 */
void fir_filter_neon_intrinsics(short *output, const short *input, const short *kernel, int width, int kernelSize) {
    const int offset = -kernelSize / 2;
    /* Clamp so that a negative size can never produce negative tap indices. */
    const int taps = kernelSize > 0 ? kernelSize : 0;

    for (int nn = 0; nn < width; nn++) {
        int sum = 0;
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
        /* Largest multiple of four that fits: handled four taps per iteration. */
        const int vectorTaps = taps & ~3;
        int32x4_t sum_vec = vdupq_n_s32(0);
        for (int mm = 0; mm < vectorTaps; mm += 4) {
            const int16x4_t kernel_vec = vld1_s16(kernel + mm);
            const int16x4_t input_vec = vld1_s16(input + (nn + offset + mm));
            /* Widening multiply-accumulate: four 16x16 -> 32-bit products per call. */
            sum_vec = vmlal_s16(sum_vec, kernel_vec, input_vec);
        }
        /* Horizontal add of the four partial sums. */
        sum += vgetq_lane_s32(sum_vec, 0);
        sum += vgetq_lane_s32(sum_vec, 1);
        sum += vgetq_lane_s32(sum_vec, 2);
        sum += vgetq_lane_s32(sum_vec, 3);

        /* Remaining 0-3 taps that do not fill a whole vector. */
        for (int mm = vectorTaps; mm < taps; mm++) {
            sum += kernel[mm] * input[nn + offset + mm];
        }
#else /* scalar reference path, for comparison purposes and non-NEON builds */
        for (int mm = 0; mm < taps; mm++) {
            sum += kernel[mm] * input[nn + offset + mm];
        }
#endif
        output[nn] = static_cast<short>((sum + 0x8000) >> 16);
    }
}

c的实例的实现

// Plain C implementation (no SIMD).
/*
 * FIR filter: for each of the `width` output samples, multiply the
 * `kernelSize` taps with the input window that starts kernelSize / 2 samples
 * before the current position, accumulate in an int, add 0x8000 and keep the
 * upper 16 bits of the result.
 */
static void fir_filter_c(short *output, const short *input, const short *kernel, int width, int kernelSize) {
    const int windowStart = -kernelSize / 2;

    for (int outIdx = 0; outIdx < width; ++outIdx) {
        const int base = outIdx + windowStart;
        int acc = 0;
        for (int tap = 0; tap < kernelSize; ++tap) {
            acc += kernel[tap] * input[base + tap];
        }
        const int rounded = acc + 0x8000;
        output[outIdx] = (short)(rounded >> 16);
    }
}

主体

#include "napi/native_api.h" 
#include <hilog/log.h> 
#include <stdio.h> 
#include <stdlib.h> 
#include <string.h> 
#include <time.h> 
#include "helloneon-intrinsics.h" 
 
/*
 * Returns a timestamp in milliseconds for measuring elapsed time.
 *
 * The value is read from CLOCK_MONOTONIC rather than CLOCK_REALTIME so that a
 * wall-clock adjustment between two calls cannot distort (or turn negative)
 * the difference the benchmark computes. Only differences between two return
 * values are meaningful; the absolute value is not a calendar time.
 *
 * Returns 0.0 if the clock cannot be read, instead of using an uninitialised
 * timespec.
 */
static double now_ms(void) {
    struct timespec ts;
    if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) {
        return 0.0;
    }
    return 1000.0 * ts.tv_sec + (double)ts.tv_nsec / 1e6;
}
// Plain C implementation, used as the baseline of the benchmark.
/*
 * Scalar FIR filter. Output sample i is the dot product of the kernel with
 * the input window beginning at i - kernelSize / 2, computed in an int,
 * biased by 0x8000 and shifted right by 16 bits.
 */
static void fir_filter_c(short *output, const short *input, const short *kernel, int width, int kernelSize) {
    const int shift = -kernelSize / 2;
    int pos = 0;

    while (pos < width) {
        int total = 0;
        int first = pos + shift; /* index of the first input sample in the window */
        for (int k = 0; k < kernelSize; k++) {
            total += kernel[k] * input[first + k];
        }
        output[pos] = (short)((total + 0x8000) >> 16);
        pos++;
    }
}
/* Benchmark dimensions. */
#define FIR_KERNEL_SIZE 32   /* number of taps in fir_kernel */
#define FIR_OUTPUT_SIZE 2560 /* samples produced by one filter pass */
/* The input is longer than the output by one kernel length so that every
 * filter window stays inside the buffer. */
#define FIR_INPUT_SIZE (FIR_OUTPUT_SIZE + FIR_KERNEL_SIZE)
#define FIR_ITERATIONS 600   /* filter passes timed per implementation */

/*
 * Filter taps: the same 16-value ramp repeated twice. The fourth-from-last
 * value of each half is 0x70, mirroring the 0x70 near its start; it used to be
 * written as 070, which is an octal literal (decimal 56), not hexadecimal.
 */
static const short fir_kernel[FIR_KERNEL_SIZE] = {0x10, 0x20, 0x40, 0x70, 0x8c, 0xa2, 0xce, 0xf0, 0xe9, 0xce, 0xa2,
                                                  0x8c, 0x70, 0x40, 0x20, 0x10, 0x10, 0x20, 0x40, 0x70, 0x8c, 0xa2,
                                                  0xce, 0xf0, 0xe9, 0xce, 0xa2, 0x8c, 0x70, 0x40, 0x20, 0x10};

/* Result of the most recent filter pass. */
static short fir_output[FIR_OUTPUT_SIZE];
/* Backing storage for the input samples. */
static short fir_input_0[FIR_INPUT_SIZE];
/* Pointer handed to the filters: it starts half a kernel into the storage
 * because the filters read from index -kernelSize / 2 onwards. */
static const short *fir_input = fir_input_0 + (FIR_KERNEL_SIZE / 2);
/* Reference output the benchmark result is compared against. */
static short fir_output_expected[FIR_OUTPUT_SIZE];
 
/**
 * N-API callback exported as `testNeon`.
 *
 * Fills the input buffer, computes the reference output with fir_filter_c,
 * times FIR_ITERATIONS passes of the C filter and of the NEON filter, compares
 * the NEON output with the reference, logs the outcome and returns the report
 * text as a JS string.
 *
 * @param env  N-API environment used to create the returned string.
 * @param info callback info; not read, the function takes no JS arguments.
 * @return the benchmark report, or a null napi_value if the string could not
 *         be created.
 */
static napi_value TestNeon(napi_env env, napi_callback_info info) {
    (void)info;
    char buffer[512];

    /* setup FIR input - the exact values do not matter */
    for (int nn = 0; nn < FIR_INPUT_SIZE; nn++) {
        fir_input_0[nn] = (5 * nn) & 255;
    }
    fir_filter_c(fir_output_expected, fir_input, fir_kernel, FIR_OUTPUT_SIZE, FIR_KERNEL_SIZE);

    /* Benchmark small FIR filter loop - C version */
    double t0 = now_ms();
    for (int count = FIR_ITERATIONS; count > 0; count--) {
        fir_filter_c(fir_output, fir_input, fir_kernel, FIR_OUTPUT_SIZE, FIR_KERNEL_SIZE);
    }
    const double time_c = now_ms() - t0;

    /* Benchmark small FIR filter loop - Neon version */
    t0 = now_ms();
    for (int count = FIR_ITERATIONS; count > 0; count--) {
        fir_filter_neon_intrinsics(fir_output, fir_input, fir_kernel, FIR_OUTPUT_SIZE, FIR_KERNEL_SIZE);
    }
    const double time_neon = now_ms() - t0;

    /* Guard the division against a (near) zero NEON time. */
    const double speedup = time_c / (time_neon < 1e-6 ? 1. : time_neon);

    /* Format the whole report in one bounded call; snprintf always terminates
     * the buffer, so no heap allocation or non-standard string helper is needed. */
    const int written = snprintf(buffer, sizeof buffer,
                                 "FIR Filter benchmark:\nC version          : %g ms\nNeon version   : %g ms (x%g faster)\n",
                                 time_c, time_neon, speedup);
    size_t length = 0;
    if (written > 0) {
        /* A return value >= sizeof buffer means the text was truncated. */
        length = (size_t)written < sizeof buffer ? (size_t)written : sizeof buffer - 1;
    } else {
        buffer[0] = '\0';
    }

    /* check the result, just in case: fir_output now holds the NEON output */
    {
        int fails = 0;
        for (int nn = 0; nn < FIR_OUTPUT_SIZE; nn++) {
            if (fir_output[nn] != fir_output_expected[nn]) {
                /* Log only the first few mismatches to keep the log readable. */
                if (++fails < 16)
                    OH_LOG_WARN(LOG_APP, "neon[%{public}d] = %{public}d expected %{public}d", nn, fir_output[nn],
                                fir_output_expected[nn]);
            }
        }
        /* HiLog masks arguments unless the specifier is marked {public}. */
        OH_LOG_WARN(LOG_APP, "%{public}d fails\n", fails);
    }

    OH_LOG_WARN(LOG_APP, "%{public}s", buffer);

    /* Pass the real text length: passing sizeof buffer would put the
     * terminator and the unused tail of the buffer into the JS string. */
    napi_value result = nullptr;
    napi_create_string_utf8(env, buffer, length, &result);

    return result;
}
/**
 * N-API callback exported as `add`: returns the sum of its two arguments.
 *
 * @param env  N-API environment.
 * @param info callback info carrying the JS arguments.
 * @return a JS number holding args[0] + args[1]; a null napi_value when fewer
 *         than two arguments were supplied or the result could not be created
 *         (presumably surfaced to JS as undefined - confirm against the
 *         N-API runtime).
 */
static napi_value Add(napi_env env, napi_callback_info info) {
    const size_t requireArgc = 2;
    size_t argc = 2;
    napi_value args[2] = {nullptr};

    /* argc comes back as the number of arguments the caller actually passed. */
    napi_get_cb_info(env, info, &argc, args, nullptr, nullptr);
    if (argc < requireArgc) {
        return nullptr;
    }

    /* Initialised so that a failed conversion can never leave an operand
     * indeterminate. */
    double value0 = 0.0;
    napi_get_value_double(env, args[0], &value0);

    double value1 = 0.0;
    napi_get_value_double(env, args[1], &value1);

    napi_value sum = nullptr;
    napi_create_double(env, value0 + value1, &sum);

    return sum;
}
EXTERN_C_START 
/* Module initialiser: attaches the native functions to `exports` and hands
 * the same object back. */
static napi_value Init(napi_env env, napi_value exports) {
    /* One descriptor per function made visible to JS. */
    napi_property_descriptor properties[] = {
        {"add", nullptr, Add, nullptr, nullptr, nullptr, napi_default, nullptr},
        {"testNeon", nullptr, TestNeon, nullptr, nullptr, nullptr, napi_default, nullptr},
    };
    const size_t propertyCount = sizeof(properties) / sizeof(properties[0]);

    napi_define_properties(env, exports, propertyCount, properties);
    return exports;
}
EXTERN_C_END 
 
/* Descriptor passed to napi_module_register: names the module "entry" and
 * points at Init as its registration function. */
static napi_module demoModule = {
    .nm_version = 1,
    .nm_flags = 0,
    .nm_filename = nullptr,
    .nm_register_func = Init,
    .nm_modname = "entry",
    .nm_priv = nullptr, /* no private data attached */
    .reserved = {0},
};
 
/* Marked as a constructor so it runs when the shared library is loaded and
 * registers demoModule without an explicit call. */
extern "C" __attribute__((constructor)) void RegisterEntryModule(void) {
    napi_module_register(&demoModule);
}

index.d.ts

/** Native `add`: takes two numbers and returns a number. */
export const add: (a: number, b: number) => number; 
/** Native `testNeon`: takes no arguments and returns the benchmark report as a string. */
export const testNeon: () => string;

oh-package.json5

{ 
  // Module name used on the ArkTS side: import testNapi from 'libentry.so'.
  "name": "libentry.so", 
  // Type declarations for the native exports.
  "types": "./index.d.ts", 
  // NOTE(review): version is left empty here - confirm whether a value is required.
  "version": "", 
  "description": "Please describe the basic information." 
}

CMakeLists.txt

# the minimum version of CMake. 
cmake_minimum_required(VERSION 3.4.1) 
project(MyTestApplicationC) 
# Directory containing this CMakeLists.txt; used as the include root below.
set(NATIVERENDER_ROOT_PATH ${CMAKE_CURRENT_SOURCE_DIR}) 
# Sources that use NEON intrinsics and therefore get an extra compile flag.
set(neon_SRCS helloneon-intrinsics.cpp) 
# NOTE(review): -mfpu is a 32-bit ARM option; the build profile targets
# arm64-v8a, so confirm this flag is still wanted there.
set_property(SOURCE ${neon_SRCS} 
               APPEND_STRING PROPERTY COMPILE_FLAGS " -mfpu=neon") 
include_directories(${NATIVERENDER_ROOT_PATH} 
                    ${NATIVERENDER_ROOT_PATH}/include) 
# Shared library "entry", built from the N-API glue plus the NEON sources.
add_library(entry SHARED hello.cpp ${neon_SRCS}) 
# Links the N-API and HiLog libraries used by hello.cpp.
target_link_libraries(entry PUBLIC libace_napi.z.so libhilog_ndk.z.so)

ArkUI调用

import hilog from '@ohos.hilog'; 
import testNapi from 'libentry.so'; 
// UI description: a full-size Row/Column showing this.message as bold text.
// Tapping the text runs the native benchmark and logs its report.
build() { 
  Row() { 
    Column() { 
      Text(this.message) 
        .fontSize(50) 
        .fontWeight(FontWeight.Bold) 
        .onClick(() => { 
          // Marker line logged before the benchmark starts.
          hilog.info(0x0000, 'testTag', '%{public}s', "xxx"); 
          // testNeon() runs synchronously inside this click handler and returns the report string.
          hilog.info(0x0000, 'testTag', '%{public}s', testNapi.testNeon()); 
          // Marker line logged after the benchmark.
          hilog.info(0x0000, 'testTag', '%{public}s', "--------"); 
        }) 
    } 
    .width('100%') 
  } 
  .height('100%') 
}

适配版本

DevEco Studio Version: 4.0.1.601

SDK:HarmonyOS 4.0.10.11

分享
微博
QQ
微信
回复
2024-05-24 21:28:40
相关问题
HarmonyOS 如何加快编译运行速度
105浏览 • 1回复 待解决
HarmonyOS ArkTSC/C++交互
113浏览 • 1回复 待解决
元服务api和应用api 如何区分
1238浏览 • 1回复 待解决
如何实现ArkTSC/C++数组转换
272浏览 • 1回复 待解决
如何实现ArkTSC/C++HashMap转换?
703浏览 • 0回复 待解决
C++源码如何编译到HarmonyOS使用
117浏览 • 1回复 待解决
如何实现ArkTSC/C++对象传递
205浏览 • 1回复 待解决
如何在IDE中运行c语言helloworld?
2852浏览 • 1回复 待解决
HarmonyOS AvPlayer视频播放速度问题
113浏览 • 1回复 待解决
HarmonyOS rsa解密速度过于缓慢
140浏览 • 1回复 待解决
ArkTSC++互相直接调用
1117浏览 • 1回复 待解决
Worker对比TaskPool有什么优势?
191浏览 • 1回复 待解决