refactor(vd960Loop): 算法回退到 DLD154V4B,四通道适配

- 用 DLD154V4B vd1_task/per_channel 替换 vds_task 复杂算法
- 移除 FUNCTION_B/二次判断/快速变化/多重确认等增强特性
- 保留平坦性离开算法 (CN200910309382),每通道独立状态
- 灵敏度表改为 DLD154V4B 4级: {216,108,36,10} / {108,72,18,9}
- 清理废弃类型: FltHistoryManager, Loop_ACS_Info, StageRangeConfig 等
- 首次添加 vd960DBN 完整源码
This commit is contained in:
wangfq
2026-06-25 16:21:57 +08:00
parent 6fd4e564e3
commit 95808f9f25
966 changed files with 406958 additions and 84 deletions

View File

@@ -0,0 +1,414 @@
/*
* Copyright (c) 2016, 2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#ifndef __ARM_COMPUTE_NEMATH_H__
#define __ARM_COMPUTE_NEMATH_H__
#if defined(ARM_MATH_NEON)
/** Calculate floor of a vector.
*
* @param[in] val Input vector value in F32 format.
*
* @return The calculated floor vector.
*/
static inline float32x4_t vfloorq_f32(float32x4_t val);
/** Calculate inverse square root.
*
* @param[in] x Input value.
*
* @return The calculated inverse square root.
*/
static inline float32x2_t vinvsqrt_f32(float32x2_t x);
/** Calculate inverse square root.
*
* @param[in] x Input value.
*
* @return The calculated inverse square root.
*/
static inline float32x4_t vinvsqrtq_f32(float32x4_t x);
/** Calculate reciprocal.
*
* @param[in] x Input value.
*
* @return The calculated reciprocal.
*/
static inline float32x2_t vinv_f32(float32x2_t x);
/** Calculate reciprocal.
*
* @param[in] x Input value.
*
* @return The calculated reciprocal.
*/
static inline float32x4_t vinvq_f32(float32x4_t x);
/** Perform a 7th degree polynomial approximation using Estrin's method.
*
* @param[in] x Input vector value in F32 format.
* @param[in] coeffs Polynomial coefficients table. (array of flattened float32x4_t vectors)
*
* @return The calculated approximation.
*/
static inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const float32_t *coeffs);
/** Calculate exponential
*
* @param[in] x Input vector value in F32 format.
*
* @return The calculated exponent.
*/
static inline float32x4_t vexpq_f32(float32x4_t x);
/** Calculate logarithm
*
* @param[in] x Input vector value in F32 format.
*
* @return The calculated logarithm.
*/
static inline float32x4_t vlogq_f32(float32x4_t x);
/** Calculate hyperbolic tangent.
*
* tanh(x) = (e^2x - 1)/(e^2x + 1)
*
* @note We clamp x to [-5,5] to avoid overflowing issues.
*
* @param[in] val Input vector value in F32 format.
*
* @return The calculated Hyperbolic Tangent.
*/
static inline float32x4_t vtanhq_f32(float32x4_t val);
/** Calculate n power of a number.
*
* pow(x,n) = e^(n*log(x))
*
* @param[in] val Input vector value in F32 format.
* @param[in] n Powers to raise the input to.
*
* @return The calculated power.
*/
static inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** Calculate hyperbolic tangent.
*
* tanh(x) = (e^2x - 1)/(e^2x + 1)
*
* @note We clamp x to [-5,5] to avoid overflowing issues.
*
* @param[in] val Input vector value in F32 format.
*
* @return The calculated Hyperbolic Tangent.
*/
static inline float16x8_t vtanhq_f16(float16x8_t val);
/** Calculate reciprocal.
*
* @param[in] x Input value.
*
* @return The calculated reciprocal.
*/
static inline float16x4_t vinv_f16(float16x4_t x);
/** Calculate reciprocal.
*
* @param[in] x Input value.
*
* @return The calculated reciprocal.
*/
static inline float16x8_t vinvq_f16(float16x8_t x);
/** Calculate inverse square root.
*
* @param[in] x Input value.
*
* @return The calculated inverse square root.
*/
static inline float16x4_t vinvsqrt_f16(float16x4_t x);
/** Calculate inverse square root.
*
* @param[in] x Input value.
*
* @return The calculated inverse square root.
*/
static inline float16x8_t vinvsqrtq_f16(float16x8_t x);
/** Calculate exponential
*
* @param[in] x Input vector value in F16 format.
*
* @return The calculated exponent.
*/
static inline float16x8_t vexpq_f16(float16x8_t x);
/** Calculate n power of a number.
*
* pow(x,n) = e^(n*log(x))
*
* @param[in] val Input vector value in F16 format.
* @param[in] n Powers to raise the input to.
*
* @return The calculated power.
*/
static inline float16x8_t vpowq_f16(float16x8_t val, float16x8_t n);
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
/** Exponent polynomial coefficients */
extern const float32_t exp_tab[4*8];
/** Logarithm polynomial coefficients */
extern const float32_t log_tab[4*8];
#ifndef DOXYGEN_SKIP_THIS
inline float32x4_t vfloorq_f32(float32x4_t val)
{
static const float32_t CONST_1[4] = {1.f,1.f,1.f,1.f};
const int32x4_t z = vcvtq_s32_f32(val);
const float32x4_t r = vcvtq_f32_s32(z);
return vbslq_f32(vcgtq_f32(r, val), vsubq_f32(r, vld1q_f32(CONST_1)), r);
}
inline float32x2_t vinvsqrt_f32(float32x2_t x)
{
float32x2_t sqrt_reciprocal = vrsqrte_f32(x);
sqrt_reciprocal = vmul_f32(vrsqrts_f32(vmul_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
sqrt_reciprocal = vmul_f32(vrsqrts_f32(vmul_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
return sqrt_reciprocal;
}
inline float32x4_t vinvsqrtq_f32(float32x4_t x)
{
float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
return sqrt_reciprocal;
}
inline float32x2_t vinv_f32(float32x2_t x)
{
float32x2_t recip = vrecpe_f32(x);
recip = vmul_f32(vrecps_f32(x, recip), recip);
recip = vmul_f32(vrecps_f32(x, recip), recip);
return recip;
}
inline float32x4_t vinvq_f32(float32x4_t x)
{
float32x4_t recip = vrecpeq_f32(x);
recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
return recip;
}
inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const float32_t *coeffs)
{
float32x4_t A = vmlaq_f32(vld1q_f32(&coeffs[4*0]), vld1q_f32(&coeffs[4*4]), x);
float32x4_t B = vmlaq_f32(vld1q_f32(&coeffs[4*2]), vld1q_f32(&coeffs[4*6]), x);
float32x4_t C = vmlaq_f32(vld1q_f32(&coeffs[4*1]), vld1q_f32(&coeffs[4*5]), x);
float32x4_t D = vmlaq_f32(vld1q_f32(&coeffs[4*3]), vld1q_f32(&coeffs[4*7]), x);
float32x4_t x2 = vmulq_f32(x, x);
float32x4_t x4 = vmulq_f32(x2, x2);
float32x4_t res = vmlaq_f32(vmlaq_f32(A, B, x2), vmlaq_f32(C, D, x2), x4);
return res;
}
inline float32x4_t vexpq_f32(float32x4_t x)
{
static const float32_t CONST_LN2[4] = {0.6931471805f,0.6931471805f,0.6931471805f,0.6931471805f}; // ln(2)
static const float32_t CONST_INV_LN2[4] = {1.4426950408f,1.4426950408f,1.4426950408f,1.4426950408f}; // 1/ln(2)
static const float32_t CONST_0[4] = {0.f,0.f,0.f,0.f};
static const int32_t CONST_NEGATIVE_126[4] = {-126,-126,-126,-126};
// Perform range reduction [-log(2),log(2)]
int32x4_t m = vcvtq_s32_f32(vmulq_f32(x, vld1q_f32(CONST_INV_LN2)));
float32x4_t val = vmlsq_f32(x, vcvtq_f32_s32(m), vld1q_f32(CONST_LN2));
// Polynomial Approximation
float32x4_t poly = vtaylor_polyq_f32(val, exp_tab);
// Reconstruct
poly = vreinterpretq_f32_s32(vqaddq_s32(vreinterpretq_s32_f32(poly), vqshlq_n_s32(m, 23)));
poly = vbslq_f32(vcltq_s32(m, vld1q_s32(CONST_NEGATIVE_126)), vld1q_f32(CONST_0), poly);
return poly;
}
inline float32x4_t vlogq_f32(float32x4_t x)
{
static const int32_t CONST_127[4] = {127,127,127,127}; // 127
static const float32_t CONST_LN2[4] = {0.6931471805f,0.6931471805f,0.6931471805f,0.6931471805f}; // ln(2)
// Extract exponent
int32x4_t m = vsubq_s32(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), vld1q_s32(CONST_127));
float32x4_t val = vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));
// Polynomial Approximation
float32x4_t poly = vtaylor_polyq_f32(val, log_tab);
// Reconstruct
poly = vmlaq_f32(poly, vcvtq_f32_s32(m), vld1q_f32(CONST_LN2));
return poly;
}
inline float32x4_t vtanhq_f32(float32x4_t val)
{
static const float32_t CONST_1[4] = {1.f,1.f,1.f,1.f};
static const float32_t CONST_2[4] = {2.f,2.f,2.f,2.f};
static const float32_t CONST_MIN_TANH[4] = {-10.f,-10.f,-10.f,-10.f};
static const float32_t CONST_MAX_TANH[4] = {10.f,10.f,10.f,10.f};
float32x4_t x = vminq_f32(vmaxq_f32(val, vld1q_f32(CONST_MIN_TANH)), vld1q_f32(CONST_MAX_TANH));
float32x4_t exp2x = vexpq_f32(vmulq_f32(vld1q_f32(CONST_2), x));
float32x4_t num = vsubq_f32(exp2x, vld1q_f32(CONST_1));
float32x4_t den = vaddq_f32(exp2x, vld1q_f32(CONST_1));
float32x4_t tanh = vmulq_f32(num, vinvq_f32(den));
return tanh;
}
inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n)
{
return vexpq_f32(vmulq_f32(n, vlogq_f32(val)));
}
#endif /* DOXYGEN_SKIP_THIS */
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** Exponent polynomial coefficients */
/** Logarithm polynomial coefficients */
#ifndef DOXYGEN_SKIP_THIS
inline float16x8_t vfloorq_f16(float16x8_t val)
{
static const float16_t CONST_1[8] = {1.f,1.f,1.f,1.f,1.f,1.f,1.f,1.f};
const int16x8_t z = vcvtq_s16_f16(val);
const float16x8_t r = vcvtq_f16_s16(z);
return vbslq_f16(vcgtq_f16(r, val), vsubq_f16(r, vld1q_f16(CONST_1)), r);
}
inline float16x4_t vinvsqrt_f16(float16x4_t x)
{
float16x4_t sqrt_reciprocal = vrsqrte_f16(x);
sqrt_reciprocal = vmul_f16(vrsqrts_f16(vmul_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
sqrt_reciprocal = vmul_f16(vrsqrts_f16(vmul_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
return sqrt_reciprocal;
}
inline float16x8_t vinvsqrtq_f16(float16x8_t x)
{
float16x8_t sqrt_reciprocal = vrsqrteq_f16(x);
sqrt_reciprocal = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
sqrt_reciprocal = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
return sqrt_reciprocal;
}
inline float16x4_t vinv_f16(float16x4_t x)
{
float16x4_t recip = vrecpe_f16(x);
recip = vmul_f16(vrecps_f16(x, recip), recip);
recip = vmul_f16(vrecps_f16(x, recip), recip);
return recip;
}
inline float16x8_t vinvq_f16(float16x8_t x)
{
float16x8_t recip = vrecpeq_f16(x);
recip = vmulq_f16(vrecpsq_f16(x, recip), recip);
recip = vmulq_f16(vrecpsq_f16(x, recip), recip);
return recip;
}
inline float16x8_t vtanhq_f16(float16x8_t val)
{
const float16_t CONST_1[8] = {1.f,1.f,1.f,1.f,1.f,1.f,1.f,1.f};
const float16_t CONST_2[8] = {2.f,2.f,2.f,2.f,2.f,2.f,2.f,2.f};
const float16_t CONST_MIN_TANH[8] = {-10.f,-10.f,-10.f,-10.f,-10.f,-10.f,-10.f,-10.f};
const float16_t CONST_MAX_TANH[8] = {10.f,10.f,10.f,10.f,10.f,10.f,10.f,10.f};
const float16x8_t x = vminq_f16(vmaxq_f16(val, vld1q_f16(CONST_MIN_TANH)), vld1q_f16(CONST_MAX_TANH));
const float16x8_t exp2x = vexpq_f16(vmulq_f16(vld1q_f16(CONST_2), x));
const float16x8_t num = vsubq_f16(exp2x, vld1q_f16(CONST_1));
const float16x8_t den = vaddq_f16(exp2x, vld1q_f16(CONST_1));
const float16x8_t tanh = vmulq_f16(num, vinvq_f16(den));
return tanh;
}
inline float16x8_t vtaylor_polyq_f16(float16x8_t x, const float16_t *coeffs)
{
const float16x8_t A = vaddq_f16(vld1q_f16(&coeffs[8*0]), vmulq_f16(vld1q_f16(&coeffs[8*4]), x));
const float16x8_t B = vaddq_f16(vld1q_f16(&coeffs[8*2]), vmulq_f16(vld1q_f16(&coeffs[8*6]), x));
const float16x8_t C = vaddq_f16(vld1q_f16(&coeffs[8*1]), vmulq_f16(vld1q_f16(&coeffs[8*5]), x));
const float16x8_t D = vaddq_f16(vld1q_f16(&coeffs[8*3]), vmulq_f16(vld1q_f16(&coeffs[8*7]), x));
const float16x8_t x2 = vmulq_f16(x, x);
const float16x8_t x4 = vmulq_f16(x2, x2);
const float16x8_t res = vaddq_f16(vaddq_f16(A, vmulq_f16(B, x2)), vmulq_f16(vaddq_f16(C, vmulq_f16(D, x2)), x4));
return res;
}
inline float16x8_t vexpq_f16(float16x8_t x)
{
// TODO (COMPMID-1535) : Revisit FP16 approximations
const float32x4_t x_high = vcvt_f32_f16(vget_high_f16(x));
const float32x4_t x_low = vcvt_f32_f16(vget_low_f16(x));
const float16x8_t res = vcvt_high_f16_f32(vcvt_f16_f32(vexpq_f32(x_low)), vexpq_f32(x_high));
return res;
}
inline float16x8_t vlogq_f16(float16x8_t x)
{
// TODO (COMPMID-1535) : Revisit FP16 approximations
const float32x4_t x_high = vcvt_f32_f16(vget_high_f16(x));
const float32x4_t x_low = vcvt_f32_f16(vget_low_f16(x));
const float16x8_t res = vcvt_high_f16_f32(vcvt_f16_f32(vlogq_f32(x_low)), vlogq_f32(x_high));
return res;
}
inline float16x8_t vpowq_f16(float16x8_t val, float16x8_t n)
{
// TODO (giaiod01) - COMPMID-1535
float32x4_t n0_f32 = vcvt_f32_f16(vget_low_f16(n));
float32x4_t n1_f32 = vcvt_f32_f16(vget_high_f16(n));
float32x4_t val0_f32 = vcvt_f32_f16(vget_low_f16(val));
float32x4_t val1_f32 = vcvt_f32_f16(vget_high_f16(val));
float32x4_t res0_f32 = vexpq_f32(vmulq_f32(n0_f32, vlogq_f32(val0_f32)));
float32x4_t res1_f32 = vexpq_f32(vmulq_f32(n1_f32, vlogq_f32(val1_f32)));
return vcombine_f16(vcvt_f16_f32(res0_f32), vcvt_f16_f32(res1_f32));
}
#endif /* DOXYGEN_SKIP_THIS */
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
#endif
#endif /* __ARM_COMPUTE_NEMATH_H__ */

View File

@@ -0,0 +1,19 @@
README
======
This folder is containing two files imported, and slightly modified, from the ComputeLibrary:
NEMath.h and arm_cl_tables.c
In the original compute library, there are instead two other files:
NEMath.h and NEMath.inl
NEMath.inl is included from NEMath.h whereas in this CMSIS DSP implementation, there is no NEMath.inl and its content is copied into NEMath.h
The tables contained in NEMath.inl have been moved to arm_cl_tables.c and finally the files are in C for the CMSIS DSP library and in C++ in the original Compute Library.
Otherwise, the features and implementations are the same : a few optimized Neon functions.
The license covering those files is different : It is a MIT license.
Other parts of the CMSIS-DSP are covered with an Apache-2.0 license.

View File

@@ -0,0 +1,55 @@
/*
* Copyright (c) 2016, 2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "arm_math.h"
#include "NEMath.h"
#if defined(ARM_MATH_NEON)
/** Exponent polynomial coefficients */
const float32_t exp_tab[4*8] =
{
1.f,1.f,1.f,1.f,
0.0416598916054f,0.0416598916054f,0.0416598916054f,0.0416598916054f,
0.500000596046f,0.500000596046f,0.500000596046f,0.500000596046f,
0.0014122662833f,0.0014122662833f,0.0014122662833f,0.0014122662833f,
1.00000011921f,1.00000011921f,1.00000011921f,1.00000011921f,
0.00833693705499f,0.00833693705499f,0.00833693705499f,0.00833693705499f,
0.166665703058f,0.166665703058f,0.166665703058f,0.166665703058f,
0.000195780929062f,0.000195780929062f,0.000195780929062f,0.000195780929062f
};
/** Logarithm polynomial coefficients */
const float32_t log_tab[4*8] =
{
-2.29561495781f,-2.29561495781f,-2.29561495781f,-2.29561495781f,
-2.47071170807f,-2.47071170807f,-2.47071170807f,-2.47071170807f,
-5.68692588806f,-5.68692588806f,-5.68692588806f,-5.68692588806f,
-0.165253549814f,-0.165253549814f,-0.165253549814f,-0.165253549814f,
5.17591238022f,5.17591238022f,5.17591238022f,5.17591238022f,
0.844007015228f,0.844007015228f,0.844007015228f,0.844007015228f,
4.58445882797f,4.58445882797f,4.58445882797f,4.58445882797f,
0.0141278216615f,0.0141278216615f,0.0141278216615f,0.0141278216615f
};
#endif

View File

@@ -0,0 +1,200 @@
/******************************************************************************
* @file arm_sorting.h
* @brief Private header file for CMSIS DSP Library
* @version V1.7.0
* @date 2019
******************************************************************************/
/*
* Copyright (c) 2010-2019 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _ARM_SORTING_H_
#define _ARM_SORTING_H_
#include "arm_math.h"
#ifdef __cplusplus
extern "C"
{
#endif
/**
* @param[in] S points to an instance of the sorting structure.
* @param[in] pSrc points to the block of input data.
* @param[out] pDst points to the block of output data.
* @param[in] blockSize number of samples to process.
*/
void arm_bubble_sort_f32(
const arm_sort_instance_f32 * S,
float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize);
/**
* @param[in] S points to an instance of the sorting structure.
* @param[in] pSrc points to the block of input data.
* @param[out] pDst points to the block of output data.
* @param[in] blockSize number of samples to process.
*/
void arm_heap_sort_f32(
const arm_sort_instance_f32 * S,
float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize);
/**
* @param[in] S points to an instance of the sorting structure.
* @param[in] pSrc points to the block of input data.
* @param[out] pDst points to the block of output data.
* @param[in] blockSize number of samples to process.
*/
void arm_insertion_sort_f32(
const arm_sort_instance_f32 * S,
float32_t *pSrc,
float32_t* pDst,
uint32_t blockSize);
/**
* @param[in] S points to an instance of the sorting structure.
* @param[in] pSrc points to the block of input data.
* @param[out] pDst points to the block of output data
* @param[in] blockSize number of samples to process.
*/
void arm_quick_sort_f32(
const arm_sort_instance_f32 * S,
float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize);
/**
* @param[in] S points to an instance of the sorting structure.
* @param[in] pSrc points to the block of input data.
* @param[out] pDst points to the block of output data
* @param[in] blockSize number of samples to process.
*/
void arm_selection_sort_f32(
const arm_sort_instance_f32 * S,
float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize);
/**
* @param[in] S points to an instance of the sorting structure.
* @param[in] pSrc points to the block of input data.
* @param[out] pDst points to the block of output data
* @param[in] blockSize number of samples to process.
*/
void arm_bitonic_sort_f32(
const arm_sort_instance_f32 * S,
float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize);
#if defined(ARM_MATH_NEON)
#define vtrn256_128q(a, b) \
do { \
float32x4_t vtrn128_temp = a.val[1]; \
a.val[1] = b.val[0]; \
b.val[0] = vtrn128_temp ; \
} while (0)
#define vtrn128_64q(a, b) \
do { \
float32x2_t ab, cd, ef, gh; \
ab = vget_low_f32(a); \
ef = vget_low_f32(b); \
cd = vget_high_f32(a); \
gh = vget_high_f32(b); \
a = vcombine_f32(ab, ef); \
b = vcombine_f32(cd, gh); \
} while (0)
#define vtrn256_64q(a, b) \
do { \
float32x2_t a_0, a_1, a_2, a_3; \
float32x2_t b_0, b_1, b_2, b_3; \
a_0 = vget_low_f32(a.val[0]); \
a_1 = vget_high_f32(a.val[0]); \
a_2 = vget_low_f32(a.val[1]); \
a_3 = vget_high_f32(a.val[1]); \
b_0 = vget_low_f32(b.val[0]); \
b_1 = vget_high_f32(b.val[0]); \
b_2 = vget_low_f32(b.val[1]); \
b_3 = vget_high_f32(b.val[1]); \
a.val[0] = vcombine_f32(a_0, b_0); \
a.val[1] = vcombine_f32(a_2, b_2); \
b.val[0] = vcombine_f32(a_1, b_1); \
b.val[1] = vcombine_f32(a_3, b_3); \
} while (0)
#define vtrn128_32q(a, b) \
do { \
float32x4x2_t vtrn32_tmp = vtrnq_f32((a), (b)); \
(a) = vtrn32_tmp.val[0]; \
(b) = vtrn32_tmp.val[1]; \
} while (0)
#define vtrn256_32q(a, b) \
do { \
float32x4x2_t vtrn32_tmp_1 = vtrnq_f32((a.val[0]), (b.val[0])); \
float32x4x2_t vtrn32_tmp_2 = vtrnq_f32((a.val[1]), (b.val[1])); \
a.val[0] = vtrn32_tmp_1.val[0]; \
a.val[1] = vtrn32_tmp_2.val[0]; \
b.val[0] = vtrn32_tmp_1.val[1]; \
b.val[1] = vtrn32_tmp_2.val[1]; \
} while (0)
#define vminmaxq(a, b) \
do { \
float32x4_t minmax_tmp = (a); \
(a) = vminq_f32((a), (b)); \
(b) = vmaxq_f32(minmax_tmp, (b)); \
} while (0)
#define vminmax256q(a, b) \
do { \
float32x4x2_t minmax256_tmp = (a); \
a.val[0] = vminq_f32(a.val[0], b.val[0]); \
a.val[1] = vminq_f32(a.val[1], b.val[1]); \
b.val[0] = vmaxq_f32(minmax256_tmp.val[0], b.val[0]); \
b.val[1] = vmaxq_f32(minmax256_tmp.val[1], b.val[1]); \
} while (0)
#define vrev128q_f32(a) \
vcombine_f32(vrev64_f32(vget_high_f32(a)), vrev64_f32(vget_low_f32(a)))
#define vrev256q_f32(a) \
do { \
float32x4_t rev_tmp = vcombine_f32(vrev64_f32(vget_high_f32(a.val[0])), vrev64_f32(vget_low_f32(a.val[0]))); \
a.val[0] = vcombine_f32(vrev64_f32(vget_high_f32(a.val[1])), vrev64_f32(vget_low_f32(a.val[1]))); \
a.val[1] = rev_tmp; \
} while (0)
#define vldrev128q_f32(a, p) \
do { \
a = vld1q_f32(p); \
a = vrev128q_f32(a); \
} while (0)
#endif /* ARM_MATH_NEON */
#ifdef __cplusplus
}
#endif
#endif /* _ARM_SORTING_H */

View File

@@ -0,0 +1,325 @@
/******************************************************************************
* @file arm_vec_fft.h
* @brief Private header file for CMSIS DSP Library
* @version V1.7.0
* @date 07. January 2020
******************************************************************************/
/*
* Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _ARM_VEC_FFT_H_
#define _ARM_VEC_FFT_H_
#include "arm_math.h"
#include "arm_helium_utils.h"
#ifdef __cplusplus
extern "C"
{
#endif
#if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)
#define MVE_CMPLX_ADD_A_ixB(A, B) vcaddq_rot90(A,B)
#define MVE_CMPLX_SUB_A_ixB(A,B) vcaddq_rot270(A,B)
#define MVE_CMPLX_MULT_FLT_AxB(A,B) vcmlaq_rot90(vcmulq(A, B), A, B)
#define MVE_CMPLX_MULT_FLT_Conj_AxB(A,B) vcmlaq_rot270(vcmulq(A, B), A, B)
#define MVE_CMPLX_MULT_FX_AxB(A,B,TyA) vqdmladhxq(vqdmlsdhq((TyA)vuninitializedq_s32(), A, B), A, B)
#define MVE_CMPLX_MULT_FX_AxConjB(A,B,TyA) vqdmladhq(vqdmlsdhxq((TyA)vuninitializedq_s32(), A, B), A, B)
#define MVE_CMPLX_ADD_FX_A_ixB(A, B) vhcaddq_rot90(A,B)
#define MVE_CMPLX_SUB_FX_A_ixB(A,B) vhcaddq_rot270(A,B)
/**
@brief In-place 32 bit reversal function for helium
@param[in,out] pSrc points to in-place buffer of unknown 32-bit data type
@param[in] bitRevLen bit reversal table length
@param[in] pBitRevTab points to bit reversal table
@return none
*/
__STATIC_INLINE void arm_bitreversal_32_inpl_mve(
uint32_t *pSrc,
const uint16_t bitRevLen,
const uint16_t *pBitRevTab)
{
uint64_t *src = (uint64_t *) pSrc;
int32_t blkCnt; /* loop counters */
uint32x4_t bitRevTabOff;
uint32x4_t one = vdupq_n_u32(1);
uint64x2_t inLow, inHigh;
uint64x2_t bitRevOff1Low, bitRevOff0Low;
uint64x2_t bitRevOff1High, bitRevOff0High;
/* load scheduling to increase gather load idx update / gather load distance */
bitRevTabOff = vldrhq_u32(pBitRevTab);
pBitRevTab += 4;
bitRevOff0Low = vmullbq_int_u32(bitRevTabOff, one);
bitRevOff0High = vmulltq_int_u32(bitRevTabOff, one);
blkCnt = bitRevLen / 8;
while (blkCnt > 0) {
bitRevTabOff = vldrhq_u32(pBitRevTab);
pBitRevTab += 4;
/* 64-bit index expansion */
bitRevOff1Low = vmullbq_int_u32(bitRevTabOff, one);
bitRevOff1High = vmulltq_int_u32(bitRevTabOff, one);
inLow = vldrdq_gather_offset_u64(src, bitRevOff0Low);
inHigh = vldrdq_gather_offset_u64(src, bitRevOff0High);
vstrdq_scatter_offset_u64(src, bitRevOff0Low, inHigh);
vstrdq_scatter_offset_u64(src, bitRevOff0High, inLow);
/* unrolled */
bitRevTabOff = vldrhq_u32(pBitRevTab);
pBitRevTab += 4;
bitRevOff0Low = vmullbq_int_u32(bitRevTabOff, one);
bitRevOff0High = vmulltq_int_u32(bitRevTabOff, one);
inLow = vldrdq_gather_offset_u64(src, bitRevOff1Low);
inHigh = vldrdq_gather_offset_u64(src, bitRevOff1High);
vstrdq_scatter_offset_u64(src, bitRevOff1Low, inHigh);
vstrdq_scatter_offset_u64(src, bitRevOff1High, inLow);
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
if (bitRevLen & 7) {
/* FFT size = 16 */
inLow = vldrdq_gather_offset_u64(src, bitRevOff0Low);
inHigh = vldrdq_gather_offset_u64(src, bitRevOff0High);
vstrdq_scatter_offset_u64(src, bitRevOff0Low, inHigh);
vstrdq_scatter_offset_u64(src, bitRevOff0High, inLow);
}
}
/**
@brief In-place 16 bit reversal function for helium
@param[in,out] pSrc points to in-place buffer of unknown 16-bit data type
@param[in] bitRevLen bit reversal table length
@param[in] pBitRevTab points to bit reversal table
@return none
*/
__STATIC_INLINE void arm_bitreversal_16_inpl_mve(
uint16_t *pSrc,
const uint16_t bitRevLen,
const uint16_t *pBitRevTab)
{
uint32_t *src = (uint32_t *) pSrc;
int32_t blkCnt; /* loop counters */
uint32x4_t bitRevTabOff;
uint16x8_t one = vdupq_n_u16(1);
uint32x4_t bitRevOff1Low, bitRevOff0Low;
uint32x4_t bitRevOff1High, bitRevOff0High;
uint32x4_t inLow, inHigh;
/* load scheduling to increase gather load idx update / gather load distance */
bitRevTabOff = vldrhq_u16(pBitRevTab);
pBitRevTab += 8;
bitRevOff0Low = vmullbq_int_u16((uint16x8_t)bitRevTabOff, one);
bitRevOff0High = vmulltq_int_u16((uint16x8_t)bitRevTabOff, one);
bitRevOff0Low = vshrq_n_u16((uint16x8_t)bitRevOff0Low, 3);
bitRevOff0High = vshrq_n_u16((uint16x8_t)bitRevOff0High, 3);
blkCnt = (bitRevLen / 16);
while (blkCnt > 0) {
bitRevTabOff = vldrhq_u16(pBitRevTab);
pBitRevTab += 8;
bitRevOff1Low = vmullbq_int_u16((uint16x8_t)bitRevTabOff, one);
bitRevOff1High = vmulltq_int_u16((uint16x8_t)bitRevTabOff, one);
bitRevOff1Low = vshrq_n_u16((uint16x8_t)bitRevOff1Low, 3);
bitRevOff1High = vshrq_n_u16((uint16x8_t)bitRevOff1High, 3);
inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low);
inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High);
vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh);
vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow);
/* loop unrolling */
bitRevTabOff = vldrhq_u16(pBitRevTab);
pBitRevTab += 8;
bitRevOff0Low = vmullbq_int_u16((uint16x8_t)bitRevTabOff, one);
bitRevOff0High = vmulltq_int_u16((uint16x8_t)bitRevTabOff, one);
bitRevOff0Low = vshrq_n_u16((uint16x8_t)bitRevOff0Low, 3);
bitRevOff0High = vshrq_n_u16((uint16x8_t)bitRevOff0High, 3);
inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff1Low);
inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff1High);
vstrwq_scatter_shifted_offset_u32(src, bitRevOff1Low, inHigh);
vstrwq_scatter_shifted_offset_u32(src, bitRevOff1High, inLow);
blkCnt--;
}
/* tail handling */
blkCnt = bitRevLen & 0xf;
if (blkCnt == 8) {
inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low);
inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High);
vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh);
vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow);
} else if (blkCnt == 12) {
/* FFT 16 special case */
mve_pred16_t p = vctp16q(4);
bitRevTabOff = vldrhq_z_u16(pBitRevTab, p);
inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low);
inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High);
vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh);
vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow);
bitRevOff0Low = vmullbq_int_u16((uint16x8_t)bitRevTabOff, one);
bitRevOff0High = vmulltq_int_u16((uint16x8_t)bitRevTabOff, one);
bitRevOff0Low = vshrq_n_u16((uint16x8_t)bitRevOff0Low, 3);
bitRevOff0High = vshrq_n_u16((uint16x8_t)bitRevOff0High, 3);
inLow = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff0Low, p);
inHigh = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff0High, p);
vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff0Low, inHigh, p);
vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff0High, inLow, p);
}
}
/**
@brief Out-of-place 32 bit reversal function for helium
@param[out] pDst points to destination buffer of unknown 32-bit data type
@param[in] pSrc points to input buffer of unknown 32-bit data type
@param[in] fftLen FFT length
@return none
*/
__STATIC_INLINE void arm_bitreversal_32_outpl_mve(void *pDst, void *pSrc, uint32_t fftLen)
{
uint32x4_t idxOffs0, idxOffs1, bitRevOffs0, bitRevOffs1;
uint32_t bitRevPos, blkCnt;
uint32_t *pDst32 = (uint32_t *) pDst;
/* fwd indexes */
idxOffs0 = vdupq_n_u32(0);
idxOffs1 = vdupq_n_u32(0);
idxOffs0[0] = 0; idxOffs0[2] = 4;
idxOffs1[0] = 8; idxOffs1[2] = 12;
bitRevPos = (31 - __CLZ(fftLen)) + 5;
blkCnt = fftLen >> 2;
/* issued earlier to increase gather load idx update / gather load distance */
/* bit-reverse fwd indexes */
bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
while (blkCnt > 0) {
uint64x2_t vecIn;
vecIn = vldrdq_gather_offset_u64(pSrc, (uint64x2_t) bitRevOffs0);
idxOffs0 = idxOffs0 + 16;
vst1q(pDst32, (uint32x4_t) vecIn);
pDst32 += 4;
bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
vecIn = vldrdq_gather_offset_u64(pSrc, (uint64x2_t) bitRevOffs1);
idxOffs1 = idxOffs1 + 16;
vst1q(pDst32, (uint32x4_t) vecIn);
pDst32 += 4;
bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
blkCnt--;
}
}
/**
@brief Out-of-place 16 bit reversal function for helium
@param[out] pDst points to destination buffer of unknown 16-bit data type
@param[in] pSrc points to input buffer of unknown 16-bit data type
@param[in] fftLen FFT length
@return none
*/
__STATIC_INLINE void arm_bitreversal_16_outpl_mve(void *pDst, void *pSrc, uint32_t fftLen)
{
uint32x4_t idxOffs0, idxOffs1, bitRevOffs0, bitRevOffs1;
uint32_t bitRevPos, blkCnt;
uint16_t *pDst16 = (uint16_t *) pDst;
uint32_t incrIdx = 0;
/* fwd indexes */
idxOffs0 = vidupq_wb_u32(&incrIdx, 4); // {0, 4, 8, 12}
idxOffs1 = vidupq_wb_u32(&incrIdx, 4); // {16, 20, 24, 28}
bitRevPos = (31 - __CLZ(fftLen)) + 4;
blkCnt = fftLen >> 3;
/* issued earlier to increase gather load idx update / gather load distance */
/* bit-reverse fwd indexes */
bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
while (blkCnt > 0) {
uint32x4_t vecIn;
vecIn = vldrwq_gather_offset_s32(pSrc, bitRevOffs0);
idxOffs0 = idxOffs0 + 32;
vst1q(pDst16, (uint16x8_t) vecIn);
pDst16 += 8;
bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
vecIn = vldrwq_gather_offset_s32(pSrc, bitRevOffs1);
idxOffs1 = idxOffs1 + 32;
vst1q(pDst16, (uint16x8_t) vecIn);
pDst16 += 8;
bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
blkCnt--;
}
}
#endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)*/
#ifdef __cplusplus
}
#endif
#endif /* _ARM_VEC_FFT_H_ */

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,87 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: BasicMathFunctions.c
* Description: Combination of all basic math function source files.
*
* $Date: 16. March 2020
* $Revision: V1.1.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2019-2020 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_abs_f32.c"
#include "arm_abs_f64.c"
#include "arm_abs_q15.c"
#include "arm_abs_q31.c"
#include "arm_abs_q7.c"
#include "arm_add_f32.c"
#include "arm_add_f64.c"
#include "arm_add_q15.c"
#include "arm_add_q31.c"
#include "arm_add_q7.c"
#include "arm_and_u16.c"
#include "arm_and_u32.c"
#include "arm_and_u8.c"
#include "arm_dot_prod_f32.c"
#include "arm_dot_prod_f64.c"
#include "arm_dot_prod_q15.c"
#include "arm_dot_prod_q31.c"
#include "arm_dot_prod_q7.c"
#include "arm_mult_f32.c"
#include "arm_mult_f64.c"
#include "arm_mult_q15.c"
#include "arm_mult_q31.c"
#include "arm_mult_q7.c"
#include "arm_negate_f32.c"
#include "arm_negate_f64.c"
#include "arm_negate_q15.c"
#include "arm_negate_q31.c"
#include "arm_negate_q7.c"
#include "arm_not_u16.c"
#include "arm_not_u32.c"
#include "arm_not_u8.c"
#include "arm_offset_f32.c"
#include "arm_offset_f64.c"
#include "arm_offset_q15.c"
#include "arm_offset_q31.c"
#include "arm_offset_q7.c"
#include "arm_or_u16.c"
#include "arm_or_u32.c"
#include "arm_or_u8.c"
#include "arm_scale_f32.c"
#include "arm_scale_f64.c"
#include "arm_scale_q15.c"
#include "arm_scale_q31.c"
#include "arm_scale_q7.c"
#include "arm_shift_q15.c"
#include "arm_shift_q31.c"
#include "arm_shift_q7.c"
#include "arm_sub_f32.c"
#include "arm_sub_f64.c"
#include "arm_sub_q15.c"
#include "arm_sub_q31.c"
#include "arm_sub_q7.c"
#include "arm_xor_u16.c"
#include "arm_xor_u32.c"
#include "arm_xor_u8.c"
#include "arm_clip_f32.c"
#include "arm_clip_q31.c"
#include "arm_clip_q15.c"
#include "arm_clip_q7.c"

View File

@@ -0,0 +1,37 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: BasicMathFunctionsF16.c
* Description: Combination of all basic math function f16 source files.
*
* $Date: 20. April 2020
* $Revision: V1.1.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2019-2020 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_abs_f16.c"
#include "arm_add_f16.c"
#include "arm_dot_prod_f16.c"
#include "arm_mult_f16.c"
#include "arm_negate_f16.c"
#include "arm_offset_f16.c"
#include "arm_scale_f16.c"
#include "arm_sub_f16.c"
#include "arm_clip_f16.c"

View File

@@ -0,0 +1,198 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_abs_f16.c
* Description: Floating-point vector absolute value
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions_f16.h"
#include <math.h>
/**
@ingroup groupMath
*/
/**
@defgroup BasicAbs Vector Absolute Value
Computes the absolute value of a vector on an element-by-element basis.
<pre>
pDst[n] = abs(pSrc[n]), 0 <= n < blockSize.
</pre>
The functions support in-place computation allowing the source and
destination pointers to reference the same memory buffer.
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
@addtogroup BasicAbs
@{
*/
/**
@brief Floating-point vector absolute value.
@param[in] pSrc points to the input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_abs_f16(
const float16_t * pSrc,
float16_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
f16x8_t vec1;
f16x8_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 3U;
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute values and then store the results in the destination buffer. */
vec1 = vld1q(pSrc);
res = vabsq(vec1);
vst1q(pDst, res);
/* Increment pointers */
pSrc += 8;
pDst += 8;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x7;
if (blkCnt > 0U)
{
/* C = |A| */
mve_pred16_t p0 = vctp16q(blkCnt);
vec1 = vld1q(pSrc);
vstrhq_p(pDst, vabsq(vec1), p0);
}
}
#else
#if defined(ARM_FLOAT16_SUPPORTED)
void arm_abs_f16(
const float16_t * pSrc,
float16_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_NEON_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
f16x8_t vec1;
f16x8_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute values and then store the results in the destination buffer. */
vec1 = vld1q_f16(pSrc);
res = vabsq_f16(vec1);
vst1q_f16(pDst, res);
/* Increment pointers */
pSrc += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x3;
#else
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute and store result in destination buffer. */
*pDst++ = (_Float16)fabsf((float32_t)*pSrc++);
*pDst++ = (_Float16)fabsf((float32_t)*pSrc++);
*pDst++ = (_Float16)fabsf((float32_t)*pSrc++);
*pDst++ = (_Float16)fabsf((float32_t)*pSrc++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute and store result in destination buffer. */
*pDst++ = (_Float16)fabsf((float32_t)*pSrc++);
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_FLOAT16_SUPPORTED */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicAbs group
*/

View File

@@ -0,0 +1,196 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_abs_f32.c
* Description: Floating-point vector absolute value
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
#include <math.h>
/**
@ingroup groupMath
*/
/**
@defgroup BasicAbs Vector Absolute Value
Computes the absolute value of a vector on an element-by-element basis.
<pre>
pDst[n] = abs(pSrc[n]), 0 <= n < blockSize.
</pre>
The functions support in-place computation allowing the source and
destination pointers to reference the same memory buffer.
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
@addtogroup BasicAbs
@{
*/
/**
@brief Floating-point vector absolute value.
@param[in] pSrc points to the input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_abs_f32(
const float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
f32x4_t vec1;
f32x4_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute values and then store the results in the destination buffer. */
vec1 = vld1q(pSrc);
res = vabsq(vec1);
vst1q(pDst, res);
/* Increment pointers */
pSrc += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x3;
if (blkCnt > 0U)
{
/* C = |A| */
mve_pred16_t p0 = vctp32q(blkCnt);
vec1 = vld1q(pSrc);
vstrwq_p(pDst, vabsq(vec1), p0);
}
}
#else
void arm_abs_f32(
const float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
f32x4_t vec1;
f32x4_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute values and then store the results in the destination buffer. */
vec1 = vld1q_f32(pSrc);
res = vabsq_f32(vec1);
vst1q_f32(pDst, res);
/* Increment pointers */
pSrc += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x3;
#else
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute and store result in destination buffer. */
*pDst++ = fabsf(*pSrc++);
*pDst++ = fabsf(*pSrc++);
*pDst++ = fabsf(*pSrc++);
*pDst++ = fabsf(*pSrc++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute and store result in destination buffer. */
*pDst++ = fabsf(*pSrc++);
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicAbs group
*/

View File

@@ -0,0 +1,74 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_abs_f64.c
* Description: Floating-point vector absolute value
*
* $Date: 13 September 2021
* $Revision: V1.10.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
#include <math.h>
/**
@ingroup groupMath
*/
/**
@addtogroup BasicAbs
@{
*/
/**
@brief Floating-point vector absolute value.
@param[in] pSrc points to the input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_abs_f64(
const float64_t * pSrc,
float64_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute and store result in destination buffer. */
*pDst++ = fabs(*pSrc++);
/* Decrement loop counter */
blkCnt--;
}
}
/**
@} end of BasicAbs group
*/

View File

@@ -0,0 +1,178 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_abs_q15.c
* Description: Q15 vector absolute value
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicAbs
@{
*/
/**
@brief Q15 vector absolute value.
@param[in] pSrc points to the input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
The Q15 value -1 (0x8000) will be saturated to the maximum allowable positive value 0x7FFF.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_abs_q15(
const q15_t * pSrc,
q15_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counters */
q15x8_t vecSrc;
/* Compute 8 outputs at a time */
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
/*
* C = |A|
* Calculate absolute and then store the results in the destination buffer.
*/
vecSrc = vld1q(pSrc);
vst1q(pDst, vqabsq(vecSrc));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrc += 8;
pDst += 8;
}
/*
* tail
*/
blkCnt = blockSize & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecSrc = vld1q(pSrc);
vstrhq_p(pDst, vqabsq(vecSrc), p0);
}
}
#else
void arm_abs_q15(
const q15_t * pSrc,
q15_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
q15_t in; /* Temporary input variable */
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute of input (if -1 then saturated to 0x7fff) and store result in destination buffer. */
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q15_t)__QSUB16(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
#endif
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q15_t)__QSUB16(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
#endif
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q15_t)__QSUB16(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
#endif
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q15_t)__QSUB16(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute of input (if -1 then saturated to 0x7fff) and store result in destination buffer. */
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q15_t)__QSUB16(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
#endif
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicAbs group
*/

View File

@@ -0,0 +1,208 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_abs_q31.c
* Description: Q31 vector absolute value
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicAbs
@{
*/
/**
@brief Q31 vector absolute value.
@param[in] pSrc points to the input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
The Q31 value -1 (0x80000000) will be saturated to the maximum allowable positive value 0x7FFFFFFF.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_abs_q31(
const q31_t * pSrc,
q31_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counters */
q31x4_t vecSrc;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2;
while (blkCnt > 0U)
{
/*
* C = |A|
* Calculate absolute and then store the results in the destination buffer.
*/
vecSrc = vld1q(pSrc);
vst1q(pDst, vqabsq(vecSrc));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* Advance vector source and destination pointers
*/
pSrc += 4;
pDst += 4;
}
/*
* Tail
*/
blkCnt = blockSize & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecSrc = vld1q(pSrc);
vstrwq_p(pDst, vqabsq(vecSrc), p0);
}
}
#else
void arm_abs_q31(
const q31_t * pSrc,
q31_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
q31_t in; /* Temporary variable */
#if defined(ARM_MATH_NEON)
int32x4_t vec1;
int32x4_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute and then store the results in the destination buffer. */
vec1 = vld1q_s32(pSrc);
res = vqabsq_s32(vec1);
vst1q_s32(pDst, res);
/* Increment pointers */
pSrc += 4;
pDst += 4;
/* Decrement the blockSize loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x3;
#else
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute of input (if -1 then saturated to 0x7fffffff) and store result in destination buffer. */
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
#endif
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
#endif
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
#endif
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined (ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute of input (if -1 then saturated to 0x7fffffff) and store result in destination buffer. */
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
#endif
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* #if defined (ARM_MATH_MVEI) */
/**
@} end of BasicAbs group
*/

View File

@@ -0,0 +1,180 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_abs_q7.c
* Description: Q7 vector absolute value
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicAbs
@{
*/
/**
@brief Q7 vector absolute value.
@param[in] pSrc points to the input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Conditions for optimum performance
Input and output buffers should be aligned by 32-bit
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
The Q7 value -1 (0x80) will be saturated to the maximum allowable positive value 0x7F.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_abs_q7(
const q7_t * pSrc,
q7_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counters */
q7x16_t vecSrc;
/* Compute 16 outputs at a time */
blkCnt = blockSize >> 4;
while (blkCnt > 0U)
{
/*
* C = |A|
* Calculate absolute and then store the results in the destination buffer.
*/
vecSrc = vld1q(pSrc);
vst1q(pDst, vqabsq(vecSrc));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrc += 16;
pDst += 16;
}
/*
* tail
*/
blkCnt = blockSize & 0xF;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp8q(blkCnt);
vecSrc = vld1q(pSrc);
vstrbq_p(pDst, vqabsq(vecSrc), p0);
}
}
#else
void arm_abs_q7(
const q7_t * pSrc,
q7_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
q7_t in; /* Temporary input variable */
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute of input (if -1 then saturated to 0x7f) and store result in destination buffer. */
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q7_t)__QSUB8(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
#endif
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q7_t)__QSUB8(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
#endif
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q7_t)__QSUB8(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
#endif
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q7_t)__QSUB8(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute of input (if -1 then saturated to 0x7f) and store result in destination buffer. */
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q7_t) __QSUB8(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
#endif
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicAbs group
*/

View File

@@ -0,0 +1,169 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_add_f16.c
* Description: Floating-point vector addition
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions_f16.h"
/**
@ingroup groupMath
*/
/**
@defgroup BasicAdd Vector Addition
Element-by-element addition of two vectors.
<pre>
pDst[n] = pSrcA[n] + pSrcB[n], 0 <= n < blockSize.
</pre>
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
@addtogroup BasicAdd
@{
*/
/**
@brief Floating-point vector addition.
@param[in] pSrcA points to first input vector
@param[in] pSrcB points to second input vector
@param[out] pDst points to output vector
@param[in] blockSize number of samples in each vector
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_add_f16(
const float16_t * pSrcA,
const float16_t * pSrcB,
float16_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
f16x8_t vec1;
f16x8_t vec2;
f16x8_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 3U;
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
vec1 = vld1q(pSrcA);
vec2 = vld1q(pSrcB);
res = vaddq(vec1, vec2);
vst1q(pDst, res);
/* Increment pointers */
pSrcA += 8;
pSrcB += 8;
pDst += 8;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x7;
if (blkCnt > 0U)
{
/* C = A + B */
mve_pred16_t p0 = vctp16q(blkCnt);
vec1 = vld1q(pSrcA);
vec2 = vld1q(pSrcB);
vstrhq_p(pDst, vaddq(vec1,vec2), p0);
}
}
#else
#if defined(ARM_FLOAT16_SUPPORTED)
void arm_add_f16(
const float16_t * pSrcA,
const float16_t * pSrcB,
float16_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and store result in destination buffer. */
*pDst++ = (_Float16)(*pSrcA++) + (_Float16)(*pSrcB++);
*pDst++ = (_Float16)(*pSrcA++) + (_Float16)(*pSrcB++);
*pDst++ = (_Float16)(*pSrcA++) + (_Float16)(*pSrcB++);
*pDst++ = (_Float16)(*pSrcA++) + (_Float16)(*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and store result in destination buffer. */
*pDst++ = (_Float16)(*pSrcA++) + (_Float16)(*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_FLOAT16_SUPPORTED) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicAdd group
*/

View File

@@ -0,0 +1,199 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_add_f32.c
* Description: Floating-point vector addition
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@defgroup BasicAdd Vector Addition
Element-by-element addition of two vectors.
<pre>
pDst[n] = pSrcA[n] + pSrcB[n], 0 <= n < blockSize.
</pre>
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
@addtogroup BasicAdd
@{
*/
/**
@brief Floating-point vector addition.
@param[in] pSrcA points to first input vector
@param[in] pSrcB points to second input vector
@param[out] pDst points to output vector
@param[in] blockSize number of samples in each vector
@return none
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_add_f32(
const float32_t * pSrcA,
const float32_t * pSrcB,
float32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
f32x4_t vec1;
f32x4_t vec2;
f32x4_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
vec1 = vld1q(pSrcA);
vec2 = vld1q(pSrcB);
res = vaddq(vec1, vec2);
vst1q(pDst, res);
/* Increment pointers */
pSrcA += 4;
pSrcB += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x3;
if (blkCnt > 0U)
{
/* C = A + B */
mve_pred16_t p0 = vctp32q(blkCnt);
vec1 = vld1q(pSrcA);
vec2 = vld1q(pSrcB);
vstrwq_p(pDst, vaddq(vec1,vec2), p0);
}
}
#else
void arm_add_f32(
const float32_t * pSrcA,
const float32_t * pSrcB,
float32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
f32x4_t vec1;
f32x4_t vec2;
f32x4_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
vec1 = vld1q_f32(pSrcA);
vec2 = vld1q_f32(pSrcB);
res = vaddq_f32(vec1, vec2);
vst1q_f32(pDst, res);
/* Increment pointers */
pSrcA += 4;
pSrcB += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x3;
#else
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and store result in destination buffer. */
*pDst++ = (*pSrcA++) + (*pSrcB++);
*pDst++ = (*pSrcA++) + (*pSrcB++);
*pDst++ = (*pSrcA++) + (*pSrcB++);
*pDst++ = (*pSrcA++) + (*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and store result in destination buffer. */
*pDst++ = (*pSrcA++) + (*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicAdd group
*/

View File

@@ -0,0 +1,75 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_add_f64.c
* Description: Floating-point vector addition
*
* $Date: 13 September 2021
* $Revision: V1.10.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicAdd
@{
*/
/**
@brief Floating-point vector addition.
@param[in] pSrcA points to first input vector
@param[in] pSrcB points to second input vector
@param[out] pDst points to output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_add_f64(
const float64_t * pSrcA,
const float64_t * pSrcB,
float64_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and store result in destination buffer. */
*pDst++ = (*pSrcA++) + (*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
}
/**
@} end of BasicAdd group
*/

View File

@@ -0,0 +1,176 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_add_q15.c
* Description: Q15 vector addition
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicAdd
@{
*/
/**
@brief Q15 vector addition.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_add_q15(
const q15_t * pSrcA,
const q15_t * pSrcB,
q15_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counters */
q15x8_t vecA;
q15x8_t vecB;
/* Compute 8 outputs at a time */
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
/*
* C = A + B
* Add and then store the results in the destination buffer.
*/
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vst1q(pDst, vqaddq(vecA, vecB));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrcA += 8;
pSrcB += 8;
pDst += 8;
}
/*
* tail
*/
blkCnt = blockSize & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vstrhq_p(pDst, vqaddq(vecA, vecB), p0);
}
}
#else
void arm_add_q15(
const q15_t * pSrcA,
const q15_t * pSrcB,
q15_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
q31_t inA1, inA2;
q31_t inB1, inB2;
#endif
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A + B */
#if defined (ARM_MATH_DSP)
/* read 2 times 2 samples at a time from sourceA */
inA1 = read_q15x2_ia (&pSrcA);
inA2 = read_q15x2_ia (&pSrcA);
/* read 2 times 2 samples at a time from sourceB */
inB1 = read_q15x2_ia (&pSrcB);
inB2 = read_q15x2_ia (&pSrcB);
/* Add and store 2 times 2 samples at a time */
write_q15x2_ia (&pDst, __QADD16(inA1, inB1));
write_q15x2_ia (&pDst, __QADD16(inA2, inB2));
#else
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
*pDst++ = (q15_t) __QADD16(*pSrcA++, *pSrcB++);
#else
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicAdd group
*/

View File

@@ -0,0 +1,159 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_add_q31.c
* Description: Q31 vector addition
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicAdd
@{
*/
/**
@brief Q31 vector addition.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_add_q31(
const q31_t * pSrcA,
const q31_t * pSrcB,
q31_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt;
q31x4_t vecA;
q31x4_t vecB;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2;
while (blkCnt > 0U)
{
/*
* C = A + B
* Add and then store the results in the destination buffer.
*/
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vst1q(pDst, vqaddq(vecA, vecB));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrcA += 4;
pSrcB += 4;
pDst += 4;
}
/*
* tail
*/
blkCnt = blockSize & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vstrwq_p(pDst, vqaddq(vecA, vecB), p0);
}
}
#else
void arm_add_q31(
const q31_t * pSrcA,
const q31_t * pSrcB,
q31_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and store result in destination buffer. */
*pDst++ = __QADD(*pSrcA++, *pSrcB++);
*pDst++ = __QADD(*pSrcA++, *pSrcB++);
*pDst++ = __QADD(*pSrcA++, *pSrcB++);
*pDst++ = __QADD(*pSrcA++, *pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and store result in destination buffer. */
*pDst++ = __QADD(*pSrcA++, *pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicAdd group
*/

View File

@@ -0,0 +1,159 @@
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_add_q7.c
* Description: Q7 vector addition
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicAdd
@{
*/
/**
@brief Q7 vector addition.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_add_q7(
const q7_t * pSrcA,
const q7_t * pSrcB,
q7_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counters */
q7x16_t vecA;
q7x16_t vecB;
/* Compute 16 outputs at a time */
blkCnt = blockSize >> 4;
while (blkCnt > 0U)
{
/*
* C = A + B
* Add and then store the results in the destination buffer.
*/
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vst1q(pDst, vqaddq(vecA, vecB));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrcA += 16;
pSrcB += 16;
pDst += 16;
}
/*
* tail
*/
blkCnt = blockSize & 0xF;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp8q(blkCnt);
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vstrbq_p(pDst, vqaddq(vecA, vecB), p0);
}
}
#else
void arm_add_q7(
const q7_t * pSrcA,
const q7_t * pSrcB,
q7_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A + B */
#if defined (ARM_MATH_DSP)
/* Add and store result in destination buffer (4 samples at a time). */
write_q7x4_ia (&pDst, __QADD8 (read_q7x4_ia (&pSrcA), read_q7x4_ia (&pSrcB)));
#else
*pDst++ = (q7_t) __SSAT ((q15_t) *pSrcA++ + *pSrcB++, 8);
*pDst++ = (q7_t) __SSAT ((q15_t) *pSrcA++ + *pSrcB++, 8);
*pDst++ = (q7_t) __SSAT ((q15_t) *pSrcA++ + *pSrcB++, 8);
*pDst++ = (q7_t) __SSAT ((q15_t) *pSrcA++ + *pSrcB++, 8);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and store result in destination buffer. */
*pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ + *pSrcB++, 8);
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicAdd group
*/

View File

@@ -0,0 +1,137 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_and_u16.c
* Description: uint16_t bitwise AND
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@defgroup And Vector bitwise AND
Compute the logical bitwise AND.
There are separate functions for uint32_t, uint16_t, and uint7_t data types.
*/
/**
@addtogroup And
@{
*/
/**
@brief Compute the logical bitwise AND of two fixed-point vectors.
@param[in] pSrcA points to input vector A
@param[in] pSrcB points to input vector B
@param[out] pDst points to output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_and_u16(
const uint16_t * pSrcA,
const uint16_t * pSrcB,
uint16_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
uint16x8_t vecSrcA, vecSrcB;
/* Compute 8 outputs at a time */
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
vst1q(pDst, vandq_u16(vecSrcA, vecSrcB) );
pSrcA += 8;
pSrcB += 8;
pDst += 8;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
vstrhq_p(pDst, vandq_u16(vecSrcA, vecSrcB), p0);
}
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
uint16x8_t vecA, vecB;
/* Compute 8 outputs at a time */
blkCnt = blockSize >> 3U;
while (blkCnt > 0U)
{
vecA = vld1q_u16(pSrcA);
vecB = vld1q_u16(pSrcB);
vst1q_u16(pDst, vandq_u16(vecA, vecB) );
pSrcA += 8;
pSrcB += 8;
pDst += 8;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 7;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif
while (blkCnt > 0U)
{
*pDst++ = (*pSrcA++)&(*pSrcB++);
/* Decrement the loop counter */
blkCnt--;
}
#endif /* if defined(ARM_MATH_MVEI) */
}
/**
@} end of And group
*/

View File

@@ -0,0 +1,129 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_and_u32.c
* Description: uint32_t bitwise AND
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup And
@{
*/
/**
@brief Compute the logical bitwise AND of two fixed-point vectors.
@param[in] pSrcA points to input vector A
@param[in] pSrcB points to input vector B
@param[out] pDst points to output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_and_u32(
const uint32_t * pSrcA,
const uint32_t * pSrcB,
uint32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
uint32x4_t vecSrcA, vecSrcB;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2;
while (blkCnt > 0U)
{
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
vst1q(pDst, vandq_u32(vecSrcA, vecSrcB) );
pSrcA += 4;
pSrcB += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
vstrwq_p(pDst, vandq_u32(vecSrcA, vecSrcB), p0);
}
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
uint32x4_t vecA, vecB;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
vecA = vld1q_u32(pSrcA);
vecB = vld1q_u32(pSrcB);
vst1q_u32(pDst, vandq_u32(vecA, vecB) );
pSrcA += 4;
pSrcB += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 3;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif
while (blkCnt > 0U)
{
*pDst++ = (*pSrcA++)&(*pSrcB++);
/* Decrement the loop counter */
blkCnt--;
}
#endif /* if defined(ARM_MATH_MVEI) */
}
/**
@} end of And group
*/

View File

@@ -0,0 +1,130 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_and_u8.c
* Description: uint8_t bitwise AND
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup And
@{
*/
/**
@brief Compute the logical bitwise AND of two fixed-point vectors.
@param[in] pSrcA points to input vector A
@param[in] pSrcB points to input vector B
@param[out] pDst points to output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_and_u8(
const uint8_t * pSrcA,
const uint8_t * pSrcB,
uint8_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
uint8x16_t vecSrcA, vecSrcB;
/* Compute 16 outputs at a time */
blkCnt = blockSize >> 4;
while (blkCnt > 0U)
{
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
vst1q(pDst, vandq_u8(vecSrcA, vecSrcB) );
pSrcA += 16;
pSrcB += 16;
pDst += 16;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0xF;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp8q(blkCnt);
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
vstrbq_p(pDst, vandq_u8(vecSrcA, vecSrcB), p0);
}
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
uint8x16_t vecA, vecB;
/* Compute 16 outputs at a time */
blkCnt = blockSize >> 4U;
while (blkCnt > 0U)
{
vecA = vld1q_u8(pSrcA);
vecB = vld1q_u8(pSrcB);
vst1q_u8(pDst, vandq_u8(vecA, vecB) );
pSrcA += 16;
pSrcB += 16;
pDst += 16;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0xF;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif
while (blkCnt > 0U)
{
*pDst++ = (*pSrcA++)&(*pSrcB++);
/* Decrement the loop counter */
blkCnt--;
}
#endif /* if defined(ARM_MATH_MVEI) */
}
/**
@} end of And group
*/

View File

@@ -0,0 +1,141 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_clip_f16.c
* Description: Floating-point vector addition
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions_f16.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicClip
@{
*/
/**
@brief Elementwise floating-point clipping
@param[in] pSrc points to input values
@param[out] pDst points to output clipped values
@param[in] low lower bound
@param[in] high higher bound
@param[in] numSamples number of samples to clip
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_clip_f16(const float16_t * pSrc,
float16_t * pDst,
float16_t low,
float16_t high,
uint32_t numSamples)
{
uint32_t blkCnt;
f16x8_t curVec0, curVec1;
f16x8_t vecLow, vecHigh;
vecLow = vdupq_n_f16(low);
vecHigh = vdupq_n_f16(high);
curVec0 = vld1q(pSrc);
pSrc += 8;
/*
* unrolled x 2 to allow
* vldr/vstr/vmin/vmax
* stall free interleaving
*/
blkCnt = numSamples >> 4;
while (blkCnt--)
{
curVec0 = vmaxnmq(curVec0, vecLow);
curVec1 = vld1q(pSrc);
pSrc += 8;
curVec0 = vminnmq(curVec0, vecHigh);
vst1q(pDst, curVec0);
pDst += 8;
curVec1 = vmaxnmq(curVec1, vecLow);
curVec0 = vld1q(pSrc);
pSrc += 8;
curVec1 = vminnmq(curVec1, vecHigh);
vst1q(pDst, curVec1);
pDst += 8;
}
/*
* Tail handling
*/
blkCnt = numSamples - ((numSamples >> 4) << 4);
if (blkCnt >= 8)
{
curVec0 = vmaxnmq(curVec0, vecLow);
curVec0 = vminnmq(curVec0, vecHigh);
vst1q(pDst, curVec0);
pDst += 8;
curVec0 = vld1q(pSrc);
pSrc += 8;
}
if (blkCnt > 0)
{
mve_pred16_t p0 = vctp16q(blkCnt & 7);
curVec0 = vmaxnmq(curVec0, vecLow);
curVec0 = vminnmq(curVec0, vecHigh);
vstrhq_p(pDst, curVec0, p0);
}
}
#else
#if defined(ARM_FLOAT16_SUPPORTED)
void arm_clip_f16(const float16_t * pSrc,
float16_t * pDst,
float16_t low,
float16_t high,
uint32_t numSamples)
{
for (uint32_t i = 0; i < numSamples; i++)
{
if ((_Float16)pSrc[i] > (_Float16)high)
pDst[i] = high;
else if ((_Float16)pSrc[i] < (_Float16)low)
pDst[i] = low;
else
pDst[i] = pSrc[i];
}
}
#endif /* defined(ARM_FLOAT16_SUPPORTED */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicClip group
*/

View File

@@ -0,0 +1,144 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_clip_f32.c
* Description: Floating-point vector addition
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@defgroup BasicClip Elementwise clipping
Element-by-element clipping of a value.
The value is constrained between 2 bounds.
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
@addtogroup BasicClip
@{
*/
/**
@brief Elementwise floating-point clipping
@param[in] pSrc points to input values
@param[out] pDst points to output clipped values
@param[in] low lower bound
@param[in] high higher bound
@param[in] numSamples number of samples to clip
@return none
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_clip_f32(const float32_t * pSrc,
float32_t * pDst,
float32_t low,
float32_t high,
uint32_t numSamples)
{
uint32_t blkCnt;
f32x4_t curVec0, curVec1;
f32x4_t vecLow, vecHigh;
vecLow = vdupq_n_f32(low);
vecHigh = vdupq_n_f32(high);
curVec0 = vld1q(pSrc);
pSrc += 4;
/*
* unrolled x 2 to allow
* vldr/vstr/vmin/vmax
* stall free interleaving
*/
blkCnt = numSamples >> 3;
while (blkCnt--)
{
curVec0 = vmaxnmq(curVec0, vecLow);
curVec1 = vld1q(pSrc);
pSrc += 4;
curVec0 = vminnmq(curVec0, vecHigh);
vst1q(pDst, curVec0);
pDst += 4;
curVec1 = vmaxnmq(curVec1, vecLow);
curVec0 = vld1q(pSrc);
pSrc += 4;
curVec1 = vminnmq(curVec1, vecHigh);
vst1q(pDst, curVec1);
pDst += 4;
}
/*
* Tail handling
*/
blkCnt = numSamples - ((numSamples >> 3) << 3);
if (blkCnt >= 4)
{
curVec0 = vmaxnmq(curVec0, vecLow);
curVec0 = vminnmq(curVec0, vecHigh);
vst1q(pDst, curVec0);
pDst += 4;
curVec0 = vld1q(pSrc);
pSrc += 4;
}
if (blkCnt > 0)
{
mve_pred16_t p0 = vctp32q(blkCnt & 3);
curVec0 = vmaxnmq(curVec0, vecLow);
curVec0 = vminnmq(curVec0, vecHigh);
vstrwq_p(pDst, curVec0, p0);
}
}
#else
void arm_clip_f32(const float32_t * pSrc,
float32_t * pDst,
float32_t low,
float32_t high,
uint32_t numSamples)
{
uint32_t i;
for (i = 0; i < numSamples; i++)
{
if (pSrc[i] > high)
pDst[i] = high;
else if (pSrc[i] < low)
pDst[i] = low;
else
pDst[i] = pSrc[i];
}
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicClip group
*/

View File

@@ -0,0 +1,134 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_clip_q15.c
* Description: Floating-point vector addition
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicClip
@{
*/
/**
@brief Elementwise fixed-point clipping
@param[in] pSrc points to input values
@param[out] pDst points to output clipped values
@param[in] low lower bound
@param[in] high higher bound
@param[in] numSamples number of samples to clip
@return none
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_clip_q15(const q15_t * pSrc,
q15_t * pDst,
q15_t low,
q15_t high,
uint32_t numSamples)
{
uint32_t blkCnt;
q15x8_t curVec0, curVec1;
q15x8_t vecLow, vecHigh;
vecLow = vdupq_n_s16(low);
vecHigh = vdupq_n_s16(high);
curVec0 = vld1q(pSrc);
pSrc += 8;
/*
* unrolled x 2 to allow
* vldr/vstr/vmin/vmax
* stall free interleaving
*/
blkCnt = numSamples >> 4;
while (blkCnt--)
{
curVec0 = vmaxq(curVec0, vecLow);
curVec1 = vld1q(pSrc);
pSrc += 8;
curVec0 = vminq(curVec0, vecHigh);
vst1q(pDst, curVec0);
pDst += 8;
curVec1 = vmaxq(curVec1, vecLow);
curVec0 = vld1q(pSrc);
pSrc += 8;
curVec1 = vminq(curVec1, vecHigh);
vst1q(pDst, curVec1);
pDst += 8;
}
/*
* Tail handling
*/
blkCnt = numSamples - ((numSamples >> 4) << 4);
if (blkCnt >= 8)
{
curVec0 = vmaxq(curVec0, vecLow);
curVec0 = vminq(curVec0, vecHigh);
vst1q(pDst, curVec0);
pDst += 8;
curVec0 = vld1q(pSrc);
pSrc += 8;
}
if (blkCnt > 0)
{
mve_pred16_t p0 = vctp16q(blkCnt & 7);
curVec0 = vmaxq(curVec0, vecLow);
curVec0 = vminq(curVec0, vecHigh);
vstrhq_p(pDst, curVec0, p0);
}
}
#else
void arm_clip_q15(const q15_t * pSrc,
q15_t * pDst,
q15_t low,
q15_t high,
uint32_t numSamples)
{
uint32_t i;
for (i = 0; i < numSamples; i++)
{
if (pSrc[i] > high)
pDst[i] = high;
else if (pSrc[i] < low)
pDst[i] = low;
else
pDst[i] = pSrc[i];
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicClip group
*/

View File

@@ -0,0 +1,134 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_clip_q31.c
* Description: Floating-point vector addition
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicClip
@{
*/
/**
@brief Elementwise fixed-point clipping
@param[in] pSrc points to input values
@param[out] pDst points to output clipped values
@param[in] low lower bound
@param[in] high higher bound
@param[in] numSamples number of samples to clip
@return none
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_clip_q31(const q31_t * pSrc,
q31_t * pDst,
q31_t low,
q31_t high,
uint32_t numSamples)
{
uint32_t blkCnt;
q31x4_t curVec0, curVec1;
q31x4_t vecLow, vecHigh;
vecLow = vdupq_n_s32(low);
vecHigh = vdupq_n_s32(high);
curVec0 = vld1q(pSrc);
pSrc += 4;
/*
* unrolled x 2 to allow
* vldr/vstr/vmin/vmax
* stall free interleaving
*/
blkCnt = numSamples >> 3;
while (blkCnt--)
{
curVec0 = vmaxq(curVec0, vecLow);
curVec1 = vld1q(pSrc);
pSrc += 4;
curVec0 = vminq(curVec0, vecHigh);
vst1q(pDst, curVec0);
pDst += 4;
curVec1 = vmaxq(curVec1, vecLow);
curVec0 = vld1q(pSrc);
pSrc += 4;
curVec1 = vminq(curVec1, vecHigh);
vst1q(pDst, curVec1);
pDst += 4;
}
/*
* Tail handling
*/
blkCnt = numSamples - ((numSamples >> 3) << 3);
if (blkCnt >= 4)
{
curVec0 = vmaxq(curVec0, vecLow);
curVec0 = vminq(curVec0, vecHigh);
vst1q(pDst, curVec0);
pDst += 4;
curVec0 = vld1q(pSrc);
pSrc += 4;
}
if (blkCnt > 0)
{
mve_pred16_t p0 = vctp32q(blkCnt & 3);
curVec0 = vmaxq(curVec0, vecLow);
curVec0 = vminq(curVec0, vecHigh);
vstrwq_p(pDst, curVec0, p0);
}
}
#else
void arm_clip_q31(const q31_t * pSrc,
q31_t * pDst,
q31_t low,
q31_t high,
uint32_t numSamples)
{
uint32_t i;
for (i = 0; i < numSamples; i++)
{
if (pSrc[i] > high)
pDst[i] = high;
else if (pSrc[i] < low)
pDst[i] = low;
else
pDst[i] = pSrc[i];
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicClip group
*/

View File

@@ -0,0 +1,134 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_clip_q7.c
* Description: Floating-point vector addition
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicClip
@{
*/
/**
@brief Elementwise fixed-point clipping
@param[in] pSrc points to input values
@param[out] pDst points to output clipped values
@param[in] low lower bound
@param[in] high higher bound
@param[in] numSamples number of samples to clip
@return none
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_clip_q7(const q7_t * pSrc,
q7_t * pDst,
q7_t low,
q7_t high,
uint32_t numSamples)
{
uint32_t blkCnt;
q7x16_t curVec0, curVec1;
q7x16_t vecLow, vecHigh;
vecLow = vdupq_n_s8(low);
vecHigh = vdupq_n_s8(high);
curVec0 = vld1q(pSrc);
pSrc += 16;
/*
* unrolled x 2 to allow
* vldr/vstr/vmin/vmax
* stall free interleaving
*/
blkCnt = numSamples >> 5;
while (blkCnt--)
{
curVec0 = vmaxq(curVec0, vecLow);
curVec1 = vld1q(pSrc);
pSrc += 16;
curVec0 = vminq(curVec0, vecHigh);
vst1q(pDst, curVec0);
pDst += 16;
curVec1 = vmaxq(curVec1, vecLow);
curVec0 = vld1q(pSrc);
pSrc += 16;
curVec1 = vminq(curVec1, vecHigh);
vst1q(pDst, curVec1);
pDst += 16;
}
/*
* Tail handling
*/
blkCnt = numSamples - ((numSamples >> 5) << 5);
if (blkCnt >= 16)
{
curVec0 = vmaxq(curVec0, vecLow);
curVec0 = vminq(curVec0, vecHigh);
vst1q(pDst, curVec0);
pDst += 16;
curVec0 = vld1q(pSrc);
pSrc += 16;
}
if (blkCnt > 0)
{
mve_pred16_t p0 = vctp8q(blkCnt & 0xf);
curVec0 = vmaxq(curVec0, vecLow);
curVec0 = vminq(curVec0, vecHigh);
vstrbq_p(pDst, curVec0, p0);
}
}
#else
void arm_clip_q7(const q7_t * pSrc,
q7_t * pDst,
q7_t low,
q7_t high,
uint32_t numSamples)
{
uint32_t i;
for (i = 0; i < numSamples; i++)
{
if (pSrc[i] > high)
pDst[i] = high;
else if (pSrc[i] < low)
pDst[i] = low;
else
pDst[i] = pSrc[i];
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicClip group
*/

View File

@@ -0,0 +1,184 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_dot_prod_f16.c
* Description: Floating-point dot product
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions_f16.h"
/**
@ingroup groupMath
*/
/**
@defgroup BasicDotProd Vector Dot Product
Computes the dot product of two vectors.
The vectors are multiplied element-by-element and then summed.
<pre>
sum = pSrcA[0]*pSrcB[0] + pSrcA[1]*pSrcB[1] + ... + pSrcA[blockSize-1]*pSrcB[blockSize-1]
</pre>
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
@addtogroup BasicDotProd
@{
*/
/**
@brief Dot product of floating-point vectors.
@param[in] pSrcA points to the first input vector.
@param[in] pSrcB points to the second input vector.
@param[in] blockSize number of samples in each vector.
@param[out] result output result returned here.
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_dot_prod_f16(
const float16_t * pSrcA,
const float16_t * pSrcB,
uint32_t blockSize,
float16_t * result)
{
f16x8_t vecA, vecB;
f16x8_t vecSum;
uint32_t blkCnt;
float16_t sum = 0.0f;
vecSum = vdupq_n_f16(0.0f);
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 3U;
while (blkCnt > 0U)
{
/*
* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1]
* Calculate dot product and then store the result in a temporary buffer.
* and advance vector source and destination pointers
*/
vecA = vld1q(pSrcA);
pSrcA += 8;
vecB = vld1q(pSrcB);
pSrcB += 8;
vecSum = vfmaq(vecSum, vecA, vecB);
/*
* Decrement the blockSize loop counter
*/
blkCnt --;
}
blkCnt = blockSize & 7;
if (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
mve_pred16_t p0 = vctp16q(blkCnt);
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vecSum = vfmaq_m(vecSum, vecA, vecB, p0);
}
sum = vecAddAcrossF16Mve(vecSum);
/* Store result in destination buffer */
*result = sum;
}
#else
#if defined(ARM_FLOAT16_SUPPORTED)
void arm_dot_prod_f16(
const float16_t * pSrcA,
const float16_t * pSrcB,
uint32_t blockSize,
float16_t * result)
{
uint32_t blkCnt; /* Loop counter */
_Float16 sum = 0.0f; /* Temporary return variable */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Calculate dot product and store result in a temporary buffer. */
sum += (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
sum += (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
sum += (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
sum += (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Calculate dot product and store result in a temporary buffer. */
sum += (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
/* Store result in destination buffer */
*result = sum;
}
#endif
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicDotProd group
*/

View File

@@ -0,0 +1,228 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_dot_prod_f32.c
* Description: Floating-point dot product
*
* $Date: 05 October 2021
* $Revision: V1.9.1
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@defgroup BasicDotProd Vector Dot Product
Computes the dot product of two vectors.
The vectors are multiplied element-by-element and then summed.
<pre>
sum = pSrcA[0]*pSrcB[0] + pSrcA[1]*pSrcB[1] + ... + pSrcA[blockSize-1]*pSrcB[blockSize-1]
</pre>
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
@addtogroup BasicDotProd
@{
*/
/**
@brief Dot product of floating-point vectors.
@param[in] pSrcA points to the first input vector.
@param[in] pSrcB points to the second input vector.
@param[in] blockSize number of samples in each vector.
@param[out] result output result returned here.
@return none
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_dot_prod_f32(
const float32_t * pSrcA,
const float32_t * pSrcB,
uint32_t blockSize,
float32_t * result)
{
f32x4_t vecA, vecB;
f32x4_t vecSum;
uint32_t blkCnt;
float32_t sum = 0.0f;
vecSum = vdupq_n_f32(0.0f);
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/*
* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1]
* Calculate dot product and then store the result in a temporary buffer.
* and advance vector source and destination pointers
*/
vecA = vld1q(pSrcA);
pSrcA += 4;
vecB = vld1q(pSrcB);
pSrcB += 4;
vecSum = vfmaq(vecSum, vecA, vecB);
/*
* Decrement the blockSize loop counter
*/
blkCnt --;
}
blkCnt = blockSize & 3;
if (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
mve_pred16_t p0 = vctp32q(blkCnt);
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vecSum = vfmaq_m(vecSum, vecA, vecB, p0);
}
sum = vecAddAcrossF32Mve(vecSum);
/* Store result in destination buffer */
*result = sum;
}
#else
void arm_dot_prod_f32(
const float32_t * pSrcA,
const float32_t * pSrcB,
uint32_t blockSize,
float32_t * result)
{
uint32_t blkCnt; /* Loop counter */
float32_t sum = 0.0f; /* Temporary return variable */
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
f32x4_t vec1;
f32x4_t vec2;
f32x4_t accum = vdupq_n_f32(0);
#if !defined(__aarch64__)
f32x2_t tmp = vdup_n_f32(0);
#endif
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
vec1 = vld1q_f32(pSrcA);
vec2 = vld1q_f32(pSrcB);
while (blkCnt > 0U)
{
/* C = A[0]*B[0] + A[1]*B[1] + A[2]*B[2] + ... + A[blockSize-1]*B[blockSize-1] */
/* Calculate dot product and then store the result in a temporary buffer. */
accum = vmlaq_f32(accum, vec1, vec2);
/* Increment pointers */
pSrcA += 4;
pSrcB += 4;
vec1 = vld1q_f32(pSrcA);
vec2 = vld1q_f32(pSrcB);
/* Decrement the loop counter */
blkCnt--;
}
#if defined(__aarch64__)
sum = vpadds_f32(vpadd_f32(vget_low_f32(accum), vget_high_f32(accum)));
#else
tmp = vpadd_f32(vget_low_f32(accum), vget_high_f32(accum));
sum = vget_lane_f32(tmp, 0) + vget_lane_f32(tmp, 1);
#endif
/* Tail */
blkCnt = blockSize & 0x3;
#else
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Calculate dot product and store result in a temporary buffer. */
sum += (*pSrcA++) * (*pSrcB++);
sum += (*pSrcA++) * (*pSrcB++);
sum += (*pSrcA++) * (*pSrcB++);
sum += (*pSrcA++) * (*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Calculate dot product and store result in a temporary buffer. */
sum += (*pSrcA++) * (*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
/* Store result in destination buffer */
*result = sum;
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicDotProd group
*/

View File

@@ -0,0 +1,78 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_dot_prod_f64.c
* Description: Floating-point dot product
*
* $Date: 13 September 2021
* $Revision: V1.10.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicDotProd
@{
*/
/**
@brief Dot product of floating-point vectors.
@param[in] pSrcA points to the first input vector.
@param[in] pSrcB points to the second input vector.
@param[in] blockSize number of samples in each vector.
@param[out] result output result returned here.
@return none
*/
void arm_dot_prod_f64(
const float64_t * pSrcA,
const float64_t * pSrcB,
uint32_t blockSize,
float64_t * result)
{
uint32_t blkCnt; /* Loop counter */
float64_t sum = 0.; /* Temporary return variable */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Calculate dot product and store result in a temporary buffer. */
sum += (*pSrcA++) * (*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
/* Store result in destination buffer */
*result = sum;
}
/**
@} end of BasicDotProd group
*/

View File

@@ -0,0 +1,172 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_dot_prod_q15.c
* Description: Q15 dot product
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicDotProd
@{
*/
/**
@brief Dot product of Q15 vectors.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[in] blockSize number of samples in each vector
@param[out] result output result returned here
@return none
@par Scaling and Overflow Behavior
The intermediate multiplications are in 1.15 x 1.15 = 2.30 format and these
results are added to a 64-bit accumulator in 34.30 format.
Nonsaturating additions are used and given that there are 33 guard bits in the accumulator
there is no risk of overflow.
The return result is in 34.30 format.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_dot_prod_q15(
const q15_t * pSrcA,
const q15_t * pSrcB,
uint32_t blockSize,
q63_t * result)
{
uint32_t blkCnt; /* loop counters */
q15x8_t vecA;
q15x8_t vecB;
q63_t sum = 0LL;
/* Compute 8 outputs at a time */
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
/*
* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1]
* Calculate dot product and then store the result in a temporary buffer.
*/
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
sum = vmlaldavaq(sum, vecA, vecB);
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrcA += 8;
pSrcB += 8;
}
/*
* tail
*/
blkCnt = blockSize & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
sum = vmlaldavaq_p(sum, vecA, vecB, p0);
}
*result = sum;
}
#else
void arm_dot_prod_q15(
const q15_t * pSrcA,
const q15_t * pSrcB,
uint32_t blockSize,
q63_t * result)
{
uint32_t blkCnt; /* Loop counter */
q63_t sum = 0; /* Temporary return variable */
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
#if defined (ARM_MATH_DSP)
/* Calculate dot product and store result in a temporary buffer. */
sum = __SMLALD(read_q15x2_ia (&pSrcA), read_q15x2_ia (&pSrcB), sum);
sum = __SMLALD(read_q15x2_ia (&pSrcA), read_q15x2_ia (&pSrcB), sum);
#else
sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Calculate dot product and store result in a temporary buffer. */
//#if defined (ARM_MATH_DSP)
// sum = __SMLALD(*pSrcA++, *pSrcB++, sum);
//#else
sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
//#endif
/* Decrement loop counter */
blkCnt--;
}
/* Store result in destination buffer in 34.30 format */
*result = sum;
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicDotProd group
*/

View File

@@ -0,0 +1,174 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_dot_prod_q31.c
* Description: Q31 dot product
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicDotProd
@{
*/
/**
@brief Dot product of Q31 vectors.
@param[in] pSrcA points to the first input vector.
@param[in] pSrcB points to the second input vector.
@param[in] blockSize number of samples in each vector.
@param[out] result output result returned here.
@return none
@par Scaling and Overflow Behavior
The intermediate multiplications are in 1.31 x 1.31 = 2.62 format and these
are truncated to 2.48 format by discarding the lower 14 bits.
The 2.48 result is then added without saturation to a 64-bit accumulator in 16.48 format.
There are 15 guard bits in the accumulator and there is no risk of overflow as long as
the length of the vectors is less than 2^16 elements.
The return result is in 16.48 format.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_dot_prod_q31(
const q31_t * pSrcA,
const q31_t * pSrcB,
uint32_t blockSize,
q63_t * result)
{
uint32_t blkCnt; /* loop counters */
q31x4_t vecA;
q31x4_t vecB;
q63_t sum = 0LL;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2;
while (blkCnt > 0U)
{
/*
* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1]
* Calculate dot product and then store the result in a temporary buffer.
*/
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
sum = vrmlaldavhaq(sum, vecA, vecB);
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrcA += 4;
pSrcB += 4;
}
/*
* tail
*/
blkCnt = blockSize & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
sum = vrmlaldavhaq_p(sum, vecA, vecB, p0);
}
/*
* vrmlaldavhaq provides extra intermediate accumulator headroom.
* limiting the need of intermediate scaling
* Scalar variant uses 2.48 accu format by right shifting accumulators by 14.
* 16.48 output conversion is performed outside the loop by scaling accu. by 6
*/
*result = asrl(sum, (14 - 8));
}
#else
void arm_dot_prod_q31(
const q31_t * pSrcA,
const q31_t * pSrcB,
uint32_t blockSize,
q63_t * result)
{
uint32_t blkCnt; /* Loop counter */
q63_t sum = 0; /* Temporary return variable */
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Calculate dot product and store result in a temporary buffer. */
sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Calculate dot product and store result in a temporary buffer. */
sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
/* Decrement loop counter */
blkCnt--;
}
/* Store result in destination buffer in 16.48 format */
*result = sum;
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicDotProd group
*/

View File

@@ -0,0 +1,191 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_dot_prod_q7.c
* Description: Q7 dot product
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicDotProd
@{
*/
/**
@brief Dot product of Q7 vectors.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[in] blockSize number of samples in each vector
@param[out] result output result returned here
@return none
@par Scaling and Overflow Behavior
The intermediate multiplications are in 1.7 x 1.7 = 2.14 format and these
results are added to an accumulator in 18.14 format.
Nonsaturating additions are used and there is no danger of wrap around as long as
the vectors are less than 2^18 elements long.
The return result is in 18.14 format.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_dot_prod_q7(
const q7_t * pSrcA,
const q7_t * pSrcB,
uint32_t blockSize,
q31_t * result)
{
uint32_t blkCnt; /* loop counters */
q7x16_t vecA;
q7x16_t vecB;
q31_t sum = 0;
/* Compute 16 outputs at a time */
blkCnt = blockSize >> 4;
while (blkCnt > 0U)
{
/*
* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1]
* Calculate dot product and then store the result in a temporary buffer.
*/
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
sum = vmladavaq(sum, vecA, vecB);
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrcA += 16;
pSrcB += 16;
}
/*
* tail
*/
blkCnt = blockSize & 0xF;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp8q(blkCnt);
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
sum = vmladavaq_p(sum, vecA, vecB, p0);
}
*result = sum;
}
#else
void arm_dot_prod_q7(
const q7_t * pSrcA,
const q7_t * pSrcB,
uint32_t blockSize,
q31_t * result)
{
uint32_t blkCnt; /* Loop counter */
q31_t sum = 0; /* Temporary return variable */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
q31_t input1, input2; /* Temporary variables */
q31_t inA1, inA2, inB1, inB2; /* Temporary variables */
#endif
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
#if defined (ARM_MATH_DSP)
/* read 4 samples at a time from sourceA */
input1 = read_q7x4_ia (&pSrcA);
/* read 4 samples at a time from sourceB */
input2 = read_q7x4_ia (&pSrcB);
/* extract two q7_t samples to q15_t samples */
inA1 = __SXTB16(__ROR(input1, 8));
/* extract reminaing two samples */
inA2 = __SXTB16(input1);
/* extract two q7_t samples to q15_t samples */
inB1 = __SXTB16(__ROR(input2, 8));
/* extract reminaing two samples */
inB2 = __SXTB16(input2);
/* multiply and accumulate two samples at a time */
sum = __SMLAD(inA1, inB1, sum);
sum = __SMLAD(inA2, inB2, sum);
#else
sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Calculate dot product and store result in a temporary buffer. */
//#if defined (ARM_MATH_DSP)
// sum = __SMLAD(*pSrcA++, *pSrcB++, sum);
//#else
sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
//#endif
/* Decrement loop counter */
blkCnt--;
}
/* Store result in destination buffer in 18.14 format */
*result = sum;
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicDotProd group
*/

View File

@@ -0,0 +1,171 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mult_f16.c
* Description: Floating-point vector multiplication
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions_f16.h"
/**
@ingroup groupMath
*/
/**
@defgroup BasicMult Vector Multiplication
Element-by-element multiplication of two vectors.
<pre>
pDst[n] = pSrcA[n] * pSrcB[n], 0 <= n < blockSize.
</pre>
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
@addtogroup BasicMult
@{
*/
/**
@brief Floating-point vector multiplication.
@param[in] pSrcA points to the first input vector.
@param[in] pSrcB points to the second input vector.
@param[out] pDst points to the output vector.
@param[in] blockSize number of samples in each vector.
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_mult_f16(
const float16_t * pSrcA,
const float16_t * pSrcB,
float16_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
f16x8_t vec1;
f16x8_t vec2;
f16x8_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 3U;
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
vec1 = vld1q(pSrcA);
vec2 = vld1q(pSrcB);
res = vmulq(vec1, vec2);
vst1q(pDst, res);
/* Increment pointers */
pSrcA += 8;
pSrcB += 8;
pDst += 8;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x7;
if (blkCnt > 0U)
{
/* C = A + B */
mve_pred16_t p0 = vctp16q(blkCnt);
vec1 = vld1q(pSrcA);
vec2 = vld1q(pSrcB);
vstrhq_p(pDst, vmulq(vec1,vec2), p0);
}
}
#else
#if defined(ARM_FLOAT16_SUPPORTED)
void arm_mult_f16(
const float16_t * pSrcA,
const float16_t * pSrcB,
float16_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply inputs and store result in destination buffer. */
*pDst++ = (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
*pDst++ = (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
*pDst++ = (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
*pDst++ = (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply input and store result in destination buffer. */
*pDst++ = (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
}
#endif
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicMult group
*/

View File

@@ -0,0 +1,200 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mult_f32.c
* Description: Floating-point vector multiplication
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@defgroup BasicMult Vector Multiplication
Element-by-element multiplication of two vectors.
<pre>
pDst[n] = pSrcA[n] * pSrcB[n], 0 <= n < blockSize.
</pre>
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
@addtogroup BasicMult
@{
*/
/**
@brief Floating-point vector multiplication.
@param[in] pSrcA points to the first input vector.
@param[in] pSrcB points to the second input vector.
@param[out] pDst points to the output vector.
@param[in] blockSize number of samples in each vector.
@return none
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_mult_f32(
const float32_t * pSrcA,
const float32_t * pSrcB,
float32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
f32x4_t vec1;
f32x4_t vec2;
f32x4_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
vec1 = vld1q(pSrcA);
vec2 = vld1q(pSrcB);
res = vmulq(vec1, vec2);
vst1q(pDst, res);
/* Increment pointers */
pSrcA += 4;
pSrcB += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x3;
if (blkCnt > 0U)
{
/* C = A + B */
mve_pred16_t p0 = vctp32q(blkCnt);
vec1 = vld1q(pSrcA);
vec2 = vld1q(pSrcB);
vstrwq_p(pDst, vmulq(vec1,vec2), p0);
}
}
#else
void arm_mult_f32(
const float32_t * pSrcA,
const float32_t * pSrcB,
float32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
f32x4_t vec1;
f32x4_t vec2;
f32x4_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply the inputs and then store the results in the destination buffer. */
vec1 = vld1q_f32(pSrcA);
vec2 = vld1q_f32(pSrcB);
res = vmulq_f32(vec1, vec2);
vst1q_f32(pDst, res);
/* Increment pointers */
pSrcA += 4;
pSrcB += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x3;
#else
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply inputs and store result in destination buffer. */
*pDst++ = (*pSrcA++) * (*pSrcB++);
*pDst++ = (*pSrcA++) * (*pSrcB++);
*pDst++ = (*pSrcA++) * (*pSrcB++);
*pDst++ = (*pSrcA++) * (*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply input and store result in destination buffer. */
*pDst++ = (*pSrcA++) * (*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicMult group
*/

View File

@@ -0,0 +1,75 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mult_f64.c
* Description: Floating-point vector multiplication
*
* $Date: 13 September 2021
* $Revision: V1.10.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicMult
@{
*/
/**
@brief Floating-point vector multiplication.
@param[in] pSrcA points to the first input vector.
@param[in] pSrcB points to the second input vector.
@param[out] pDst points to the output vector.
@param[in] blockSize number of samples in each vector.
@return none
*/
void arm_mult_f64(
const float64_t * pSrcA,
const float64_t * pSrcB,
float64_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply input and store result in destination buffer. */
*pDst++ = (*pSrcA++) * (*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
}
/**
@} end of BasicMult group
*/

View File

@@ -0,0 +1,192 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mult_q15.c
* Description: Q15 vector multiplication
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicMult
@{
*/
/**
@brief Q15 vector multiplication
@param[in] pSrcA points to first input vector
@param[in] pSrcB points to second input vector
@param[out] pDst points to output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_mult_q15(
const q15_t * pSrcA,
const q15_t * pSrcB,
q15_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counters */
q15x8_t vecA, vecB;
/* Compute 8 outputs at a time */
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
/*
* C = A * B
* Multiply the inputs and then store the results in the destination buffer.
*/
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vst1q(pDst, vqdmulhq(vecA, vecB));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrcA += 8;
pSrcB += 8;
pDst += 8;
}
/*
* tail
*/
blkCnt = blockSize & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vstrhq_p(pDst, vqdmulhq(vecA, vecB), p0);
}
}
#else
void arm_mult_q15(
const q15_t * pSrcA,
const q15_t * pSrcB,
q15_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
q31_t inA1, inA2, inB1, inB2; /* Temporary input variables */
q15_t out1, out2, out3, out4; /* Temporary output variables */
q31_t mul1, mul2, mul3, mul4; /* Temporary variables */
#endif
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A * B */
#if defined (ARM_MATH_DSP)
/* read 2 samples at a time from sourceA */
inA1 = read_q15x2_ia (&pSrcA);
/* read 2 samples at a time from sourceB */
inB1 = read_q15x2_ia (&pSrcB);
/* read 2 samples at a time from sourceA */
inA2 = read_q15x2_ia (&pSrcA);
/* read 2 samples at a time from sourceB */
inB2 = read_q15x2_ia (&pSrcB);
/* multiply mul = sourceA * sourceB */
mul1 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1 >> 16));
mul2 = (q31_t) ((q15_t) (inA1 ) * (q15_t) (inB1 ));
mul3 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) (inB2 >> 16));
mul4 = (q31_t) ((q15_t) (inA2 ) * (q15_t) (inB2 ));
/* saturate result to 16 bit */
out1 = (q15_t) __SSAT(mul1 >> 15, 16);
out2 = (q15_t) __SSAT(mul2 >> 15, 16);
out3 = (q15_t) __SSAT(mul3 >> 15, 16);
out4 = (q15_t) __SSAT(mul4 >> 15, 16);
/* store result to destination */
#ifndef ARM_MATH_BIG_ENDIAN
write_q15x2_ia (&pDst, __PKHBT(out2, out1, 16));
write_q15x2_ia (&pDst, __PKHBT(out4, out3, 16));
#else
write_q15x2_ia (&pDst, __PKHBT(out1, out2, 16));
write_q15x2_ia (&pDst, __PKHBT(out3, out4, 16));
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
#else
*pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
*pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
*pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
*pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply inputs and store result in destination buffer. */
*pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicMult group
*/

View File

@@ -0,0 +1,168 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mult_q31.c
* Description: Q31 vector multiplication
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicMult
@{
*/
/**
@brief Q31 vector multiplication.
@param[in] pSrcA points to the first input vector.
@param[in] pSrcB points to the second input vector.
@param[out] pDst points to the output vector.
@param[in] blockSize number of samples in each vector.
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] are saturated.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_mult_q31(
const q31_t * pSrcA,
const q31_t * pSrcB,
q31_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counters */
q31x4_t vecA, vecB;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2;
while (blkCnt > 0U)
{
/*
* C = A * B
* Multiply the inputs and then store the results in the destination buffer.
*/
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vst1q(pDst, vqdmulhq(vecA, vecB));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrcA += 4;
pSrcB += 4;
pDst += 4;
}
/*
* tail
*/
blkCnt = blockSize & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vstrwq_p(pDst, vqdmulhq(vecA, vecB), p0);
}
}
#else
void arm_mult_q31(
const q31_t * pSrcA,
const q31_t * pSrcB,
q31_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
q31_t out; /* Temporary output variable */
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply inputs and store result in destination buffer. */
out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
out = __SSAT(out, 31);
*pDst++ = out << 1U;
out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
out = __SSAT(out, 31);
*pDst++ = out << 1U;
out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
out = __SSAT(out, 31);
*pDst++ = out << 1U;
out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
out = __SSAT(out, 31);
*pDst++ = out << 1U;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply inputs and store result in destination buffer. */
out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
out = __SSAT(out, 31);
*pDst++ = out << 1U;
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicMult group
*/

View File

@@ -0,0 +1,168 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mult_q7.c
* Description: Q7 vector multiplication
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicMult
@{
*/
/**
@brief Q7 vector multiplication
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_mult_q7(
const q7_t * pSrcA,
const q7_t * pSrcB,
q7_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counters */
q7x16_t vecA, vecB;
/* Compute 16 outputs at a time */
blkCnt = blockSize >> 4;
while (blkCnt > 0U)
{
/*
* C = A * B
* Multiply the inputs and then store the results in the destination buffer.
*/
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vst1q(pDst, vqdmulhq(vecA, vecB));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrcA += 16;
pSrcB += 16;
pDst += 16;
}
/*
* tail
*/
blkCnt = blockSize & 0xF;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp8q(blkCnt);
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vstrbq_p(pDst, vqdmulhq(vecA, vecB), p0);
}
}
#else
void arm_mult_q7(
const q7_t * pSrcA,
const q7_t * pSrcB,
q7_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
q7_t out1, out2, out3, out4; /* Temporary output variables */
#endif
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A * B */
#if defined (ARM_MATH_DSP)
/* Multiply inputs and store results in temporary variables */
out1 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
out2 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
out3 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
out4 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
/* Pack and store result in destination buffer (in single write) */
write_q7x4_ia (&pDst, __PACKq7(out1, out2, out3, out4));
#else
*pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
*pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
*pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
*pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply input and store result in destination buffer. */
*pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicMult group
*/

View File

@@ -0,0 +1,166 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_negate_f16.c
* Description: Negates floating-point vectors
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions_f16.h"
/**
@ingroup groupMath
*/
/**
@defgroup BasicNegate Vector Negate
Negates the elements of a vector.
<pre>
pDst[n] = -pSrc[n], 0 <= n < blockSize.
</pre>
The functions support in-place computation allowing the source and
destination pointers to reference the same memory buffer.
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
@addtogroup BasicNegate
@{
*/
/**
@brief Negates the elements of a floating-point vector.
@param[in] pSrc points to input vector.
@param[out] pDst points to output vector.
@param[in] blockSize number of samples in each vector.
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_negate_f16(
const float16_t * pSrc,
float16_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
f16x8_t vec1;
f16x8_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 3U;
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute values and then store the results in the destination buffer. */
vec1 = vld1q(pSrc);
res = vnegq(vec1);
vst1q(pDst, res);
/* Increment pointers */
pSrc += 8;
pDst += 8;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x7;
if (blkCnt > 0U)
{
/* C = |A| */
mve_pred16_t p0 = vctp16q(blkCnt);
vec1 = vld1q((float16_t const *) pSrc);
vstrhq_p(pDst, vnegq(vec1), p0);
}
}
#else
#if defined(ARM_FLOAT16_SUPPORTED)
void arm_negate_f16(
const float16_t * pSrc,
float16_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = -A */
/* Negate and store result in destination buffer. */
*pDst++ = -(_Float16)*pSrc++;
*pDst++ = -(_Float16)*pSrc++;
*pDst++ = -(_Float16)*pSrc++;
*pDst++ = -(_Float16)*pSrc++;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = -A */
/* Negate and store result in destination buffer. */
*pDst++ = -(_Float16)*pSrc++;
/* Decrement loop counter */
blkCnt--;
}
}
#endif
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicNegate group
*/

View File

@@ -0,0 +1,192 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_negate_f32.c
* Description: Negates floating-point vectors
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@defgroup BasicNegate Vector Negate
Negates the elements of a vector.
<pre>
pDst[n] = -pSrc[n], 0 <= n < blockSize.
</pre>
The functions support in-place computation allowing the source and
destination pointers to reference the same memory buffer.
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
@addtogroup BasicNegate
@{
*/
/**
@brief Negates the elements of a floating-point vector.
@param[in] pSrc points to input vector.
@param[out] pDst points to output vector.
@param[in] blockSize number of samples in each vector.
@return none
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_negate_f32(
const float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
f32x4_t vec1;
f32x4_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute values and then store the results in the destination buffer. */
vec1 = vld1q(pSrc);
res = vnegq(vec1);
vst1q(pDst, res);
/* Increment pointers */
pSrc += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x3;
if (blkCnt > 0U)
{
/* C = |A| */
mve_pred16_t p0 = vctp32q(blkCnt);
vec1 = vld1q((float32_t const *) pSrc);
vstrwq_p(pDst, vnegq(vec1), p0);
}
}
#else
void arm_negate_f32(
const float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_NEON_EXPERIMENTAL) && !defined(ARM_MATH_AUTOVECTORIZE)
f32x4_t vec1;
f32x4_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = -A */
/* Negate and then store the results in the destination buffer. */
vec1 = vld1q_f32(pSrc);
res = vnegq_f32(vec1);
vst1q_f32(pDst, res);
/* Increment pointers */
pSrc += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x3;
#else
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = -A */
/* Negate and store result in destination buffer. */
*pDst++ = -*pSrc++;
*pDst++ = -*pSrc++;
*pDst++ = -*pSrc++;
*pDst++ = -*pSrc++;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON_EXPERIMENTAL) */
while (blkCnt > 0U)
{
/* C = -A */
/* Negate and store result in destination buffer. */
*pDst++ = -*pSrc++;
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicNegate group
*/

View File

@@ -0,0 +1,73 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_negate_f64.c
* Description: Negates floating-point vectors
*
* $Date: 13 September 2021
* $Revision: V1.10.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicNegate
@{
*/
/**
@brief Negates the elements of a floating-point vector.
@param[in] pSrc points to input vector.
@param[out] pDst points to output vector.
@param[in] blockSize number of samples in each vector.
@return none
*/
void arm_negate_f64(
const float64_t * pSrc,
float64_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* C = -A */
/* Negate and store result in destination buffer. */
*pDst++ = -*pSrc++;
/* Decrement loop counter */
blkCnt--;
}
}
/**
@} end of BasicNegate group
*/

View File

@@ -0,0 +1,171 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_negate_q15.c
* Description: Negates Q15 vectors
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicNegate
@{
*/
/**
@brief Negates the elements of a Q15 vector.
@param[in] pSrc points to the input vector.
@param[out] pDst points to the output vector.
@param[in] blockSize number of samples in each vector.
@return none
@par Conditions for optimum performance
Input and output buffers should be aligned by 32-bit
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
The Q15 value -1 (0x8000) is saturated to the maximum allowable positive value 0x7FFF.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_negate_q15(
const q15_t * pSrc,
q15_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counters */
q15x8_t vecSrc;
/* Compute 8 outputs at a time */
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
/*
* C = -A
* Negate and then store the results in the destination buffer.
*/
vecSrc = vld1q(pSrc);
vst1q(pDst, vqnegq(vecSrc));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrc += 8;
pDst += 8;
}
/*
* tail
*/
blkCnt = blockSize & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecSrc = vld1q(pSrc);
vstrhq_p(pDst, vqnegq(vecSrc), p0);
}
}
#else
void arm_negate_q15(
const q15_t * pSrc,
q15_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
q15_t in; /* Temporary input variable */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
q31_t in1; /* Temporary input variables */
#endif
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = -A */
#if defined (ARM_MATH_DSP)
/* Negate and store result in destination buffer (2 samples at a time). */
in1 = read_q15x2_ia (&pSrc);
write_q15x2_ia (&pDst, __QSUB16(0, in1));
in1 = read_q15x2_ia (&pSrc);
write_q15x2_ia (&pDst, __QSUB16(0, in1));
#else
in = *pSrc++;
*pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;
in = *pSrc++;
*pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;
in = *pSrc++;
*pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;
in = *pSrc++;
*pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = -A */
/* Negate and store result in destination buffer. */
in = *pSrc++;
*pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicNegate group
*/

View File

@@ -0,0 +1,178 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_negate_q31.c
* Description: Negates Q31 vectors
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicNegate
@{
*/
/**
@brief Negates the elements of a Q31 vector.
@param[in] pSrc points to the input vector.
@param[out] pDst points to the output vector.
@param[in] blockSize number of samples in each vector.
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
The Q31 value -1 (0x80000000) is saturated to the maximum allowable positive value 0x7FFFFFFF.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_negate_q31(
const q31_t * pSrc,
q31_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counters */
q31x4_t vecSrc;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2;
while (blkCnt > 0U)
{
/*
* C = -A
* Negate and then store the results in the destination buffer.
*/
vecSrc = vld1q(pSrc);
vst1q(pDst, vqnegq(vecSrc));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrc += 4;
pDst += 4;
}
/*
* tail
*/
blkCnt = blockSize & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecSrc = vld1q(pSrc);
vstrwq_p(pDst, vqnegq(vecSrc), p0);
}
}
#else
void arm_negate_q31(
const q31_t * pSrc,
q31_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
q31_t in; /* Temporary input variable */
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = -A */
/* Negate and store result in destination buffer. */
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = __QSUB(0, in);
#else
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
#endif
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = __QSUB(0, in);
#else
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
#endif
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = __QSUB(0, in);
#else
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
#endif
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = __QSUB(0, in);
#else
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = -A */
/* Negate and store result in destination buffer. */
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = __QSUB(0, in);
#else
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
#endif
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicNegate group
*/

View File

@@ -0,0 +1,171 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_negate_q7.c
* Description: Negates Q7 vectors
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicNegate
@{
*/
/**
@brief Negates the elements of a Q7 vector.
@param[in] pSrc points to the input vector.
@param[out] pDst points to the output vector.
@param[in] blockSize number of samples in each vector.
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
The Q7 value -1 (0x80) is saturated to the maximum allowable positive value 0x7F.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_negate_q7(
const q7_t * pSrc,
q7_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counters */
q7x16_t vecSrc;
/* Compute 16 outputs at a time */
blkCnt = blockSize >> 4;
while (blkCnt > 0U)
{
/*
* C = -A
* Negate and then store the results in the destination buffer.
*/
vecSrc = vld1q(pSrc);
vst1q(pDst, vqnegq(vecSrc));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrc += 16;
pDst += 16;
}
/*
* tail
*/
blkCnt = blockSize & 0xF;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp8q(blkCnt);
vecSrc = vld1q(pSrc);
vstrbq_p(pDst, vqnegq(vecSrc), p0);
}
}
#else
void arm_negate_q7(
const q7_t * pSrc,
q7_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
q7_t in; /* Temporary input variable */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
q31_t in1; /* Temporary input variable */
#endif
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = -A */
#if defined (ARM_MATH_DSP)
/* Negate and store result in destination buffer (4 samples at a time). */
in1 = read_q7x4_ia (&pSrc);
write_q7x4_ia (&pDst, __QSUB8(0, in1));
#else
in = *pSrc++;
*pDst++ = (in == (q7_t) 0x80) ? (q7_t) 0x7f : -in;
in = *pSrc++;
*pDst++ = (in == (q7_t) 0x80) ? (q7_t) 0x7f : -in;
in = *pSrc++;
*pDst++ = (in == (q7_t) 0x80) ? (q7_t) 0x7f : -in;
in = *pSrc++;
*pDst++ = (in == (q7_t) 0x80) ? (q7_t) 0x7f : -in;
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = -A */
/* Negate and store result in destination buffer. */
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (q7_t) __QSUB8(0, in);
#else
*pDst++ = (in == (q7_t) 0x80) ? (q7_t) 0x7f : -in;
#endif
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicNegate group
*/

View File

@@ -0,0 +1,130 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_not_u16.c
* Description: uint16_t bitwise NOT
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@defgroup Not Vector bitwise NOT
Compute the logical bitwise NOT.
There are separate functions for uint32_t, uint16_t, and uint8_t data types.
*/
/**
@addtogroup Not
@{
*/
/**
@brief Compute the logical bitwise NOT of a fixed-point vector.
@param[in] pSrc points to input vector
@param[out] pDst points to output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_not_u16(
const uint16_t * pSrc,
uint16_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
uint16x8_t vecSrc;
/* Compute 8 outputs at a time */
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
vecSrc = vld1q(pSrc);
vst1q(pDst, vmvnq_u16(vecSrc) );
pSrc += 8;
pDst += 8;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecSrc = vld1q(pSrc);
vstrhq_p(pDst, vmvnq_u16(vecSrc), p0);
}
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
uint16x8_t inV;
/* Compute 8 outputs at a time */
blkCnt = blockSize >> 3U;
while (blkCnt > 0U)
{
inV = vld1q_u16(pSrc);
vst1q_u16(pDst, vmvnq_u16(inV) );
pSrc += 8;
pDst += 8;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 7;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif
while (blkCnt > 0U)
{
*pDst++ = ~(*pSrc++);
/* Decrement the loop counter */
blkCnt--;
}
#endif /* if defined(ARM_MATH_MVEI) */
}
/**
@} end of Not group
*/

View File

@@ -0,0 +1,122 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_not_u32.c
* Description: uint32_t bitwise NOT
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup Not
@{
*/
/**
@brief Compute the logical bitwise NOT of a fixed-point vector.
@param[in] pSrc points to input vector
@param[out] pDst points to output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_not_u32(
const uint32_t * pSrc,
uint32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
uint32x4_t vecSrc;
/* Compute 8 outputs at a time */
blkCnt = blockSize >> 2;
while (blkCnt > 0U)
{
vecSrc = vld1q(pSrc);
vst1q(pDst, vmvnq_u32(vecSrc) );
pSrc += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecSrc = vld1q(pSrc);
vstrwq_p(pDst, vmvnq_u32(vecSrc), p0);
}
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
uint32x4_t inV;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
inV = vld1q_u32(pSrc);
vst1q_u32(pDst, vmvnq_u32(inV) );
pSrc += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 3;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif
while (blkCnt > 0U)
{
*pDst++ = ~(*pSrc++);
/* Decrement the loop counter */
blkCnt--;
}
#endif /* if defined(ARM_MATH_MVEI) */
}
/**
@} end of Not group
*/

View File

@@ -0,0 +1,122 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_not_u8.c
* Description: uint8_t bitwise NOT
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup Not
@{
*/
/**
@brief Compute the logical bitwise NOT of a fixed-point vector.
@param[in] pSrc points to input vector
@param[out] pDst points to output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_not_u8(
const uint8_t * pSrc,
uint8_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
uint8x16_t vecSrc;
/* Compute 16 outputs at a time */
blkCnt = blockSize >> 4;
while (blkCnt > 0U)
{
vecSrc = vld1q(pSrc);
vst1q(pDst, vmvnq_u8(vecSrc) );
pSrc += 16;
pDst += 16;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0xF;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp8q(blkCnt);
vecSrc = vld1q(pSrc);
vstrbq_p(pDst, vmvnq_u8(vecSrc), p0);
}
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
uint8x16_t inV;
/* Compute 16 outputs at a time */
blkCnt = blockSize >> 4U;
while (blkCnt > 0U)
{
inV = vld1q_u8(pSrc);
vst1q_u8(pDst, vmvnq_u8(inV) );
pSrc += 16;
pDst += 16;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0xF;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif
while (blkCnt > 0U)
{
*pDst++ = ~(*pSrc++);
/* Decrement the loop counter */
blkCnt--;
}
#endif /* if defined(ARM_MATH_MVEI) */
}
/**
@} end of Not group
*/

View File

@@ -0,0 +1,170 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_offset_f16.c
* Description: Floating-point vector offset
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions_f16.h"
/**
@ingroup groupMath
*/
/**
@defgroup BasicOffset Vector Offset
Adds a constant offset to each element of a vector.
<pre>
pDst[n] = pSrc[n] + offset, 0 <= n < blockSize.
</pre>
The functions support in-place computation allowing the source and
destination pointers to reference the same memory buffer.
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
@addtogroup BasicOffset
@{
*/
/**
@brief Adds a constant offset to a floating-point vector.
@param[in] pSrc points to the input vector
@param[in] offset is the offset to be added
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_offset_f16(
const float16_t * pSrc,
float16_t offset,
float16_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
f16x8_t vec1;
f16x8_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 3U;
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and then store the results in the destination buffer. */
vec1 = vld1q(pSrc);
res = vaddq(vec1,offset);
vst1q(pDst, res);
/* Increment pointers */
pSrc += 8;
pDst += 8;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vec1 = vld1q((float16_t const *) pSrc);
vstrhq_p(pDst, vaddq(vec1, offset), p0);
}
}
#else
#if defined(ARM_FLOAT16_SUPPORTED)
void arm_offset_f16(
const float16_t * pSrc,
float16_t offset,
float16_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and store result in destination buffer. */
*pDst++ = (_Float16)(*pSrc++) + (_Float16)offset;
*pDst++ = (_Float16)(*pSrc++) + (_Float16)offset;
*pDst++ = (_Float16)(*pSrc++) + (_Float16)offset;
*pDst++ = (_Float16)(*pSrc++) + (_Float16)offset;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and store result in destination buffer. */
*pDst++ = (_Float16)(*pSrc++) + (_Float16)offset;
/* Decrement loop counter */
blkCnt--;
}
}
#endif
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicOffset group
*/

View File

@@ -0,0 +1,196 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_offset_f32.c
* Description: Floating-point vector offset
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@defgroup BasicOffset Vector Offset
Adds a constant offset to each element of a vector.
<pre>
pDst[n] = pSrc[n] + offset, 0 <= n < blockSize.
</pre>
The functions support in-place computation allowing the source and
destination pointers to reference the same memory buffer.
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
@addtogroup BasicOffset
@{
*/
/**
@brief Adds a constant offset to a floating-point vector.
@param[in] pSrc points to the input vector
@param[in] offset is the offset to be added
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_offset_f32(
const float32_t * pSrc,
float32_t offset,
float32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
f32x4_t vec1;
f32x4_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and then store the results in the destination buffer. */
vec1 = vld1q(pSrc);
res = vaddq(vec1,offset);
vst1q(pDst, res);
/* Increment pointers */
pSrc += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vec1 = vld1q((float32_t const *) pSrc);
vstrwq_p(pDst, vaddq(vec1, offset), p0);
}
}
#else
void arm_offset_f32(
const float32_t * pSrc,
float32_t offset,
float32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_NEON_EXPERIMENTAL) && !defined(ARM_MATH_AUTOVECTORIZE)
f32x4_t vec1;
f32x4_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and then store the results in the destination buffer. */
vec1 = vld1q_f32(pSrc);
res = vaddq_f32(vec1,vdupq_n_f32(offset));
vst1q_f32(pDst, res);
/* Increment pointers */
pSrc += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x3;
#else
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and store result in destination buffer. */
*pDst++ = (*pSrc++) + offset;
*pDst++ = (*pSrc++) + offset;
*pDst++ = (*pSrc++) + offset;
*pDst++ = (*pSrc++) + offset;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON_EXPERIMENTAL) */
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and store result in destination buffer. */
*pDst++ = (*pSrc++) + offset;
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicOffset group
*/

View File

@@ -0,0 +1,75 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_offset_f64.c
* Description: Floating-point vector offset
*
* $Date: 13 September 2021
* $Revision: V1.10.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicOffset
@{
*/
/**
@brief Adds a constant offset to a floating-point vector.
@param[in] pSrc points to the input vector
@param[in] offset is the offset to be added
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_offset_f64(
const float64_t * pSrc,
float64_t offset,
float64_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and store result in destination buffer. */
*pDst++ = (*pSrc++) + offset;
/* Decrement loop counter */
blkCnt--;
}
}
/**
@} end of BasicOffset group
*/

View File

@@ -0,0 +1,168 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_offset_q15.c
* Description: Q15 vector offset
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicOffset
@{
*/
/**
@brief Adds a constant offset to a Q15 vector.
@param[in] pSrc points to the input vector
@param[in] offset is the offset to be added
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_offset_q15(
const q15_t * pSrc,
q15_t offset,
q15_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counters */
q15x8_t vecSrc;
/* Compute 8 outputs at a time */
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
/*
* C = A + offset
* Add offset and then store the result in the destination buffer.
*/
vecSrc = vld1q(pSrc);
vst1q(pDst, vqaddq(vecSrc, offset));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrc += 8;
pDst += 8;
}
/*
* tail
*/
blkCnt = blockSize & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecSrc = vld1q(pSrc);
vstrhq_p(pDst, vqaddq(vecSrc, offset), p0);
}
}
#else
void arm_offset_q15(
const q15_t * pSrc,
q15_t offset,
q15_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
q31_t offset_packed; /* Offset packed to 32 bit */
/* Offset is packed to 32 bit in order to use SIMD32 for addition */
offset_packed = __PKHBT(offset, offset, 16);
#endif
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A + offset */
#if defined (ARM_MATH_DSP)
/* Add offset and store result in destination buffer (2 samples at a time). */
write_q15x2_ia (&pDst, __QADD16(read_q15x2_ia (&pSrc), offset_packed));
write_q15x2_ia (&pDst, __QADD16(read_q15x2_ia (&pSrc), offset_packed));
#else
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrc++ + offset), 16);
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrc++ + offset), 16);
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrc++ + offset), 16);
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrc++ + offset), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
*pDst++ = (q15_t) __QADD16(*pSrc++, offset);
#else
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrc++ + offset), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicOffset group
*/

View File

@@ -0,0 +1,159 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_offset_q31.c
* Description: Q31 vector offset
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicOffset
@{
*/
/**
@brief Adds a constant offset to a Q31 vector.
@param[in] pSrc points to the input vector
@param[in] offset is the offset to be added
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_offset_q31(
const q31_t * pSrc,
q31_t offset,
q31_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counters */
q31x4_t vecSrc;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2;
while (blkCnt > 0U)
{
/*
* C = A + offset
* Add offset and then store the result in the destination buffer.
*/
vecSrc = vld1q(pSrc);
vst1q(pDst, vqaddq(vecSrc, offset));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrc += 4;
pDst += 4;
}
/*
* tail
*/
blkCnt = blockSize & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecSrc = vld1q(pSrc);
vstrwq_p(pDst, vqaddq(vecSrc, offset), p0);
}
}
#else
void arm_offset_q31(
const q31_t * pSrc,
q31_t offset,
q31_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and store result in destination buffer. */
*pDst++ = __QADD(*pSrc++, offset);
*pDst++ = __QADD(*pSrc++, offset);
*pDst++ = __QADD(*pSrc++, offset);
*pDst++ = __QADD(*pSrc++, offset);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
*pDst++ = __QADD(*pSrc++, offset);
#else
*pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrc++ + offset);
#endif
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicOffset group
*/

View File

@@ -0,0 +1,162 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_offset_q7.c
* Description: Q7 vector offset
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicOffset
@{
*/
/**
@brief Adds a constant offset to a Q7 vector.
@param[in] pSrc points to the input vector
@param[in] offset is the offset to be added
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_offset_q7(
const q7_t * pSrc,
q7_t offset,
q7_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counters */
q7x16_t vecSrc;
/* Compute 16 outputs at a time */
blkCnt = blockSize >> 4;
while (blkCnt > 0U)
{
/*
* C = A + offset
* Add offset and then store the result in the destination buffer.
*/
vecSrc = vld1q(pSrc);
vst1q(pDst, vqaddq(vecSrc, offset));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrc += 16;
pDst += 16;
}
/*
* tail
*/
blkCnt = blockSize & 0xF;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp8q(blkCnt);
vecSrc = vld1q(pSrc);
vstrbq_p(pDst, vqaddq(vecSrc, offset), p0);
}
}
#else
void arm_offset_q7(
const q7_t * pSrc,
q7_t offset,
q7_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
q31_t offset_packed; /* Offset packed to 32 bit */
/* Offset is packed to 32 bit in order to use SIMD32 for addition */
offset_packed = __PACKq7(offset, offset, offset, offset);
#endif
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A + offset */
#if defined (ARM_MATH_DSP)
/* Add offset and store result in destination buffer (4 samples at a time). */
write_q7x4_ia (&pDst, __QADD8(read_q7x4_ia (&pSrc), offset_packed));
#else
*pDst++ = (q7_t) __SSAT((q15_t) *pSrc++ + offset, 8);
*pDst++ = (q7_t) __SSAT((q15_t) *pSrc++ + offset, 8);
*pDst++ = (q7_t) __SSAT((q15_t) *pSrc++ + offset, 8);
*pDst++ = (q7_t) __SSAT((q15_t) *pSrc++ + offset, 8);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and store result in destination buffer. */
*pDst++ = (q7_t) __SSAT((q15_t) *pSrc++ + offset, 8);
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicOffset group
*/

View File

@@ -0,0 +1,137 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_or_u16.c
* Description: uint16_t bitwise inclusive OR
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@defgroup Or Vector bitwise inclusive OR
Compute the logical bitwise OR.
There are separate functions for uint32_t, uint16_t, and uint8_t data types.
*/
/**
@addtogroup Or
@{
*/
/**
@brief Compute the logical bitwise OR of two fixed-point vectors.
@param[in] pSrcA points to input vector A
@param[in] pSrcB points to input vector B
@param[out] pDst points to output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_or_u16(
const uint16_t * pSrcA,
const uint16_t * pSrcB,
uint16_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
uint16x8_t vecSrcA, vecSrcB;
/* Compute 8 outputs at a time */
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
vst1q(pDst, vorrq_u16(vecSrcA, vecSrcB) );
pSrcA += 8;
pSrcB += 8;
pDst += 8;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
vstrhq_p(pDst, vorrq_u16(vecSrcA, vecSrcB), p0);
}
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
uint16x8_t vecA, vecB;
/* Compute 8 outputs at a time */
blkCnt = blockSize >> 3U;
while (blkCnt > 0U)
{
vecA = vld1q_u16(pSrcA);
vecB = vld1q_u16(pSrcB);
vst1q_u16(pDst, vorrq_u16(vecA, vecB) );
pSrcA += 8;
pSrcB += 8;
pDst += 8;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 7;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif
while (blkCnt > 0U)
{
*pDst++ = (*pSrcA++)|(*pSrcB++);
/* Decrement the loop counter */
blkCnt--;
}
#endif /* if defined(ARM_MATH_MVEI) */
}
/**
@} end of Or group
*/

View File

@@ -0,0 +1,128 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_or_u32.c
* Description: uint32_t bitwise inclusive OR
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup Or
@{
*/
/**
@brief Compute the logical bitwise OR of two fixed-point vectors.
@param[in] pSrcA points to input vector A
@param[in] pSrcB points to input vector B
@param[out] pDst points to output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_or_u32(
const uint32_t * pSrcA,
const uint32_t * pSrcB,
uint32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
uint32x4_t vecSrcA, vecSrcB;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2;
while (blkCnt > 0U)
{
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
vst1q(pDst, vorrq_u32(vecSrcA, vecSrcB) );
pSrcA += 4;
pSrcB += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
vstrwq_p(pDst, vorrq_u32(vecSrcA, vecSrcB), p0);
}
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
uint32x4_t vecA, vecB;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
vecA = vld1q_u32(pSrcA);
vecB = vld1q_u32(pSrcB);
vst1q_u32(pDst, vorrq_u32(vecA, vecB) );
pSrcA += 4;
pSrcB += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 3;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif
while (blkCnt > 0U)
{
*pDst++ = (*pSrcA++)|(*pSrcB++);
/* Decrement the loop counter */
blkCnt--;
}
#endif /* if defined(ARM_MATH_MVEI) */
}
/**
@} end of Or group
*/

View File

@@ -0,0 +1,128 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_or_u8.c
* Description: uint8_t bitwise inclusive OR
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup Or
@{
*/
/**
@brief Compute the logical bitwise OR of two fixed-point vectors.
@param[in] pSrcA points to input vector A
@param[in] pSrcB points to input vector B
@param[out] pDst points to output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_or_u8(
const uint8_t * pSrcA,
const uint8_t * pSrcB,
uint8_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
uint8x16_t vecSrcA, vecSrcB;
/* Compute 16 outputs at a time */
blkCnt = blockSize >> 4;
while (blkCnt > 0U)
{
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
vst1q(pDst, vorrq_u8(vecSrcA, vecSrcB) );
pSrcA += 16;
pSrcB += 16;
pDst += 16;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0xF;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp8q(blkCnt);
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
vstrbq_p(pDst, vorrq_u8(vecSrcA, vecSrcB), p0);
}
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
uint8x16_t vecA, vecB;
/* Compute 16 outputs at a time */
blkCnt = blockSize >> 4U;
while (blkCnt > 0U)
{
vecA = vld1q_u8(pSrcA);
vecB = vld1q_u8(pSrcB);
vst1q_u8(pDst, vorrq_u8(vecA, vecB) );
pSrcA += 16;
pSrcB += 16;
pDst += 16;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0xF;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif
while (blkCnt > 0U)
{
*pDst++ = (*pSrcA++)|(*pSrcB++);
/* Decrement the loop counter */
blkCnt--;
}
#endif /* if defined(ARM_MATH_MVEI) */
}
/**
@} end of Or group
*/

View File

@@ -0,0 +1,183 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_scale_f16.c
* Description: Multiplies a floating-point vector by a scalar
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions_f16.h"
/**
@ingroup groupMath
*/
/**
@defgroup BasicScale Vector Scale
Multiply a vector by a scalar value. For floating-point data, the algorithm used is:
<pre>
pDst[n] = pSrc[n] * scale, 0 <= n < blockSize.
</pre>
In the fixed-point Q7, Q15, and Q31 functions, <code>scale</code> is represented by
a fractional multiplication <code>scaleFract</code> and an arithmetic shift <code>shift</code>.
The shift allows the gain of the scaling operation to exceed 1.0.
The algorithm used with fixed-point data is:
<pre>
pDst[n] = (pSrc[n] * scaleFract) << shift, 0 <= n < blockSize.
</pre>
The overall scale factor applied to the fixed-point data is
<pre>
scale = scaleFract * 2^shift.
</pre>
The functions support in-place computation allowing the source and destination
pointers to reference the same memory buffer.
*/
/**
@addtogroup BasicScale
@{
*/
/**
@brief Multiplies a floating-point vector by a scalar.
@param[in] pSrc points to the input vector
@param[in] scale scale factor to be applied
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_scale_f16(
const float16_t * pSrc,
float16_t scale,
float16_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
f16x8_t vec1;
f16x8_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 3U;
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and then store the results in the destination buffer. */
vec1 = vld1q(pSrc);
res = vmulq(vec1,scale);
vst1q(pDst, res);
/* Increment pointers */
pSrc += 8;
pDst += 8;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vec1 = vld1q((float16_t const *) pSrc);
vstrhq_p(pDst, vmulq(vec1, scale), p0);
}
}
#else
#if defined(ARM_FLOAT16_SUPPORTED)
void arm_scale_f16(
const float16_t *pSrc,
float16_t scale,
float16_t *pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A * scale */
/* Scale input and store result in destination buffer. */
*pDst++ = (_Float16)(*pSrc++) * (_Float16)scale;
*pDst++ = (_Float16)(*pSrc++) * (_Float16)scale;
*pDst++ = (_Float16)(*pSrc++) * (_Float16)scale;
*pDst++ = (_Float16)(*pSrc++) * (_Float16)scale;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A * scale */
/* Scale input and store result in destination buffer. */
*pDst++ = (_Float16)(*pSrc++) * (_Float16)scale;
/* Decrement loop counter */
blkCnt--;
}
}
#endif
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicScale group
*/

View File

@@ -0,0 +1,216 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_scale_f32.c
* Description: Multiplies a floating-point vector by a scalar
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@defgroup BasicScale Vector Scale
Multiply a vector by a scalar value. For floating-point data, the algorithm used is:
<pre>
pDst[n] = pSrc[n] * scale, 0 <= n < blockSize.
</pre>
In the fixed-point Q7, Q15, and Q31 functions, <code>scale</code> is represented by
a fractional multiplication <code>scaleFract</code> and an arithmetic shift <code>shift</code>.
The shift allows the gain of the scaling operation to exceed 1.0.
The algorithm used with fixed-point data is:
<pre>
pDst[n] = (pSrc[n] * scaleFract) << shift, 0 <= n < blockSize.
</pre>
The overall scale factor applied to the fixed-point data is
<pre>
scale = scaleFract * 2^shift.
</pre>
The functions support in-place computation allowing the source and destination
pointers to reference the same memory buffer.
*/
/**
@addtogroup BasicScale
@{
*/
/**
@brief Multiplies a floating-point vector by a scalar.
@param[in] pSrc points to the input vector
@param[in] scale scale factor to be applied
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_scale_f32(
const float32_t * pSrc,
float32_t scale,
float32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
f32x4_t vec1;
f32x4_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and then store the results in the destination buffer. */
vec1 = vld1q(pSrc);
res = vmulq(vec1,scale);
vst1q(pDst, res);
/* Increment pointers */
pSrc += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vec1 = vld1q((float32_t const *) pSrc);
vstrwq_p(pDst, vmulq(vec1, scale), p0);
}
}
#else
void arm_scale_f32(
const float32_t *pSrc,
float32_t scale,
float32_t *pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_NEON_EXPERIMENTAL)
f32x4_t vec1;
f32x4_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A * scale */
/* Scale the input and then store the results in the destination buffer. */
vec1 = vld1q_f32(pSrc);
res = vmulq_f32(vec1, vdupq_n_f32(scale));
vst1q_f32(pDst, res);
/* Increment pointers */
pSrc += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x3;
#else
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
float32_t in1, in2, in3, in4;
/* C = A * scale */
/* Scale input and store result in destination buffer. */
in1 = (*pSrc++) * scale;
in2 = (*pSrc++) * scale;
in3 = (*pSrc++) * scale;
in4 = (*pSrc++) * scale;
*pDst++ = in1;
*pDst++ = in2;
*pDst++ = in3;
*pDst++ = in4;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON_EXPERIMENTAL) */
while (blkCnt > 0U)
{
/* C = A * scale */
/* Scale input and store result in destination buffer. */
*pDst++ = (*pSrc++) * scale;
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicScale group
*/

View File

@@ -0,0 +1,75 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_scale_f64.c
* Description: Multiplies a floating-point vector by a scalar
*
* $Date: 13 September 2021
* $Revision: V1.10.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicScale
@{
*/
/**
@brief Multiplies a floating-point vector by a scalar.
@param[in] pSrc points to the input vector
@param[in] scale scale factor to be applied
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_scale_f64(
const float64_t *pSrc,
float64_t scale,
float64_t *pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* C = A * scale */
/* Scale input and store result in destination buffer. */
*pDst++ = (*pSrc++) * scale;
/* Decrement loop counter */
blkCnt--;
}
}
/**
@} end of BasicScale group
*/

View File

@@ -0,0 +1,201 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_scale_q15.c
* Description: Multiplies a Q15 vector by a scalar
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicScale
@{
*/
/**
@brief Multiplies a Q15 vector by a scalar.
@param[in] pSrc points to the input vector
@param[in] scaleFract fractional portion of the scale value
@param[in] shift number of bits to shift the result by
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.15 format.
These are multiplied to yield a 2.30 intermediate result and this is shifted with saturation to 1.15 format.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_scale_q15(
const q15_t * pSrc,
q15_t scaleFract,
int8_t shift,
q15_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counters */
q15x8_t vecSrc;
q15x8_t vecDst;
/* Compute 8 outputs at a time */
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
/*
* C = A * scale
* Scale the input and then store the result in the destination buffer.
*/
vecSrc = vld1q(pSrc);
vecDst = vmulhq(vecSrc, vdupq_n_s16(scaleFract));
vecDst = vqshlq_r(vecDst, shift + 1);
vst1q(pDst, vecDst);
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrc += 8;
pDst += 8;
}
/*
* tail
*/
blkCnt = blockSize & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);;
vecSrc = vld1q(pSrc);
vecDst = vmulhq(vecSrc, vdupq_n_s16(scaleFract));
vecDst = vqshlq_r(vecDst, shift + 1);
vstrhq_p(pDst, vecDst, p0);
}
}
#else
void arm_scale_q15(
const q15_t *pSrc,
q15_t scaleFract,
int8_t shift,
q15_t *pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
int8_t kShift = 15 - shift; /* Shift to apply after scaling */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
q31_t inA1, inA2;
q31_t out1, out2, out3, out4; /* Temporary output variables */
q15_t in1, in2, in3, in4; /* Temporary input variables */
#endif
#endif
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A * scale */
#if defined (ARM_MATH_DSP)
/* read 2 times 2 samples at a time from source */
inA1 = read_q15x2_ia (&pSrc);
inA2 = read_q15x2_ia (&pSrc);
/* Scale inputs and store result in temporary variables
* in single cycle by packing the outputs */
out1 = (q31_t) ((q15_t) (inA1 >> 16) * scaleFract);
out2 = (q31_t) ((q15_t) (inA1 ) * scaleFract);
out3 = (q31_t) ((q15_t) (inA2 >> 16) * scaleFract);
out4 = (q31_t) ((q15_t) (inA2 ) * scaleFract);
/* apply shifting */
out1 = out1 >> kShift;
out2 = out2 >> kShift;
out3 = out3 >> kShift;
out4 = out4 >> kShift;
/* saturate the output */
in1 = (q15_t) (__SSAT(out1, 16));
in2 = (q15_t) (__SSAT(out2, 16));
in3 = (q15_t) (__SSAT(out3, 16));
in4 = (q15_t) (__SSAT(out4, 16));
/* store result to destination */
write_q15x2_ia (&pDst, __PKHBT(in2, in1, 16));
write_q15x2_ia (&pDst, __PKHBT(in4, in3, 16));
#else
*pDst++ = (q15_t) (__SSAT(((q31_t) *pSrc++ * scaleFract) >> kShift, 16));
*pDst++ = (q15_t) (__SSAT(((q31_t) *pSrc++ * scaleFract) >> kShift, 16));
*pDst++ = (q15_t) (__SSAT(((q31_t) *pSrc++ * scaleFract) >> kShift, 16));
*pDst++ = (q15_t) (__SSAT(((q31_t) *pSrc++ * scaleFract) >> kShift, 16));
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A * scale */
/* Scale input and store result in destination buffer. */
*pDst++ = (q15_t) (__SSAT(((q31_t) *pSrc++ * scaleFract) >> kShift, 16));
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicScale group
*/

View File

@@ -0,0 +1,244 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_scale_q31.c
* Description: Multiplies a Q31 vector by a scalar
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicScale
@{
*/
/**
@brief Multiplies a Q31 vector by a scalar.
@param[in] pSrc points to the input vector
@param[in] scaleFract fractional portion of the scale value
@param[in] shift number of bits to shift the result by
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.31 format.
These are multiplied to yield a 2.62 intermediate result and this is shifted with saturation to 1.31 format.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_scale_q31(
const q31_t * pSrc,
q31_t scaleFract,
int8_t shift,
q31_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counters */
q31x4_t vecSrc;
q31x4_t vecDst;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2;
while (blkCnt > 0U)
{
/*
* C = A * scale
* Scale the input and then store the result in the destination buffer.
*/
vecSrc = vld1q(pSrc);
vecDst = vmulhq(vecSrc, vdupq_n_s32(scaleFract));
vecDst = vqshlq_r(vecDst, shift + 1);
vst1q(pDst, vecDst);
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrc += 4;
pDst += 4;
}
/*
* tail
*/
blkCnt = blockSize & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecSrc = vld1q(pSrc);
vecDst = vmulhq(vecSrc, vdupq_n_s32(scaleFract));
vecDst = vqshlq_r(vecDst, shift + 1);
vstrwq_p(pDst, vecDst, p0);
}
}
#else
void arm_scale_q31(
const q31_t *pSrc,
q31_t scaleFract,
int8_t shift,
q31_t *pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
q31_t in, out; /* Temporary variables */
int8_t kShift = shift + 1; /* Shift to apply after scaling */
int8_t sign = (kShift & 0x80);
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
if (sign == 0U)
{
while (blkCnt > 0U)
{
/* C = A * scale */
/* Scale input and store result in destination buffer. */
in = *pSrc++; /* read input from source */
in = ((q63_t) in * scaleFract) >> 32; /* multiply input with scaler value */
out = in << kShift; /* apply shifting */
if (in != (out >> kShift)) /* saturate the result */
out = 0x7FFFFFFF ^ (in >> 31);
*pDst++ = out; /* Store result destination */
in = *pSrc++;
in = ((q63_t) in * scaleFract) >> 32;
out = in << kShift;
if (in != (out >> kShift))
out = 0x7FFFFFFF ^ (in >> 31);
*pDst++ = out;
in = *pSrc++;
in = ((q63_t) in * scaleFract) >> 32;
out = in << kShift;
if (in != (out >> kShift))
out = 0x7FFFFFFF ^ (in >> 31);
*pDst++ = out;
in = *pSrc++;
in = ((q63_t) in * scaleFract) >> 32;
out = in << kShift;
if (in != (out >> kShift))
out = 0x7FFFFFFF ^ (in >> 31);
*pDst++ = out;
/* Decrement loop counter */
blkCnt--;
}
}
else
{
while (blkCnt > 0U)
{
/* C = A * scale */
/* Scale input and store result in destination buffer. */
in = *pSrc++; /* read four inputs from source */
in = ((q63_t) in * scaleFract) >> 32; /* multiply input with scaler value */
out = in >> -kShift; /* apply shifting */
*pDst++ = out; /* Store result destination */
in = *pSrc++;
in = ((q63_t) in * scaleFract) >> 32;
out = in >> -kShift;
*pDst++ = out;
in = *pSrc++;
in = ((q63_t) in * scaleFract) >> 32;
out = in >> -kShift;
*pDst++ = out;
in = *pSrc++;
in = ((q63_t) in * scaleFract) >> 32;
out = in >> -kShift;
*pDst++ = out;
/* Decrement loop counter */
blkCnt--;
}
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
if (sign == 0U)
{
while (blkCnt > 0U)
{
/* C = A * scale */
/* Scale input and store result in destination buffer. */
in = *pSrc++;
in = ((q63_t) in * scaleFract) >> 32;
out = in << kShift;
if (in != (out >> kShift))
out = 0x7FFFFFFF ^ (in >> 31);
*pDst++ = out;
/* Decrement loop counter */
blkCnt--;
}
}
else
{
while (blkCnt > 0U)
{
/* C = A * scale */
/* Scale input and store result in destination buffer. */
in = *pSrc++;
in = ((q63_t) in * scaleFract) >> 32;
out = in >> -kShift;
*pDst++ = out;
/* Decrement loop counter */
blkCnt--;
}
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicScale group
*/

View File

@@ -0,0 +1,186 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_scale_q7.c
* Description: Multiplies a Q7 vector by a scalar
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicScale
@{
*/
/**
@brief Multiplies a Q7 vector by a scalar.
@param[in] pSrc points to the input vector
@param[in] scaleFract fractional portion of the scale value
@param[in] shift number of bits to shift the result by
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.7 format.
These are multiplied to yield a 2.14 intermediate result and this is shifted with saturation to 1.7 format.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_scale_q7(
const q7_t * pSrc,
q7_t scaleFract,
int8_t shift,
q7_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counters */
q7x16_t vecSrc;
q7x16_t vecDst;
/* Compute 16 outputs at a time */
blkCnt = blockSize >> 4;
while (blkCnt > 0U)
{
/*
* C = A * scale
* Scale the input and then store the result in the destination buffer.
*/
vecSrc = vld1q(pSrc);
vecDst = vmulhq(vecSrc, vdupq_n_s8(scaleFract));
vecDst = vqshlq_r(vecDst, shift + 1);
vst1q(pDst, vecDst);
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrc += 16;
pDst += 16;
}
/*
* tail
*/
blkCnt = blockSize & 0xF;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp8q(blkCnt);
vecSrc = vld1q(pSrc);
vecDst = vmulhq(vecSrc, vdupq_n_s8(scaleFract));
vecDst = vqshlq_r(vecDst, shift + 1);
vstrbq_p(pDst, vecDst, p0);
}
}
#else
void arm_scale_q7(
const q7_t * pSrc,
q7_t scaleFract,
int8_t shift,
q7_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
int8_t kShift = 7 - shift; /* Shift to apply after scaling */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
q7_t in1, in2, in3, in4; /* Temporary input variables */
q7_t out1, out2, out3, out4; /* Temporary output variables */
#endif
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A * scale */
#if defined (ARM_MATH_DSP)
/* Reading 4 inputs from memory */
in1 = *pSrc++;
in2 = *pSrc++;
in3 = *pSrc++;
in4 = *pSrc++;
/* Scale inputs and store result in the temporary variable. */
out1 = (q7_t) (__SSAT(((in1) * scaleFract) >> kShift, 8));
out2 = (q7_t) (__SSAT(((in2) * scaleFract) >> kShift, 8));
out3 = (q7_t) (__SSAT(((in3) * scaleFract) >> kShift, 8));
out4 = (q7_t) (__SSAT(((in4) * scaleFract) >> kShift, 8));
/* Pack and store result in destination buffer (in single write) */
write_q7x4_ia (&pDst, __PACKq7(out1, out2, out3, out4));
#else
*pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> kShift), 8));
*pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> kShift), 8));
*pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> kShift), 8));
*pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> kShift), 8));
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A * scale */
/* Scale input and store result in destination buffer. */
*pDst++ = (q7_t) (__SSAT((((q15_t) *pSrc++ * scaleFract) >> kShift), 8));
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicScale group
*/

View File

@@ -0,0 +1,251 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_shift_q15.c
* Description: Shifts the elements of a Q15 vector by a specified number of bits
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicShift
@{
*/
/**
@brief Shifts the elements of a Q15 vector a specified number of bits
@param[in] pSrc points to the input vector
@param[in] shiftBits number of bits to shift. A positive value shifts left; a negative value shifts right.
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_shift_q15(
const q15_t * pSrc,
int8_t shiftBits,
q15_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counters */
q15x8_t vecSrc;
q15x8_t vecDst;
/* Compute 8 outputs at a time */
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
/*
* C = A (>> or <<) shiftBits
* Shift the input and then store the result in the destination buffer.
*/
vecSrc = vld1q(pSrc);
vecDst = vqshlq_r(vecSrc, shiftBits);
vst1q(pDst, vecDst);
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrc += 8;
pDst += 8;
}
/*
* tail
*/
blkCnt = blockSize & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecSrc = vld1q(pSrc);
vecDst = vqshlq_r(vecSrc, shiftBits);
vstrhq_p(pDst, vecDst, p0);
}
}
#else
void arm_shift_q15(
const q15_t * pSrc,
int8_t shiftBits,
q15_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
uint8_t sign = (shiftBits & 0x80); /* Sign of shiftBits */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
q15_t in1, in2; /* Temporary input variables */
#endif
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* If the shift value is positive then do right shift else left shift */
if (sign == 0U)
{
while (blkCnt > 0U)
{
/* C = A << shiftBits */
#if defined (ARM_MATH_DSP)
/* read 2 samples from source */
in1 = *pSrc++;
in2 = *pSrc++;
/* Shift the inputs and then store the results in the destination buffer. */
#ifndef ARM_MATH_BIG_ENDIAN
write_q15x2_ia (&pDst, __PKHBT(__SSAT(((q31_t) in1 << shiftBits), 16),
__SSAT(((q31_t) in2 << shiftBits), 16), 16));
#else
write_q15x2_ia (&pDst, __PKHBT(__SSAT(((q31_t) in2 << shiftBits), 16),
__SSAT(((q31_t) in1 << shiftBits), 16), 16));
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* read 2 samples from source */
in1 = *pSrc++;
in2 = *pSrc++;
#ifndef ARM_MATH_BIG_ENDIAN
write_q15x2_ia (&pDst, __PKHBT(__SSAT(((q31_t) in1 << shiftBits), 16),
__SSAT(((q31_t) in2 << shiftBits), 16), 16));
#else
write_q15x2_ia (&pDst, __PKHBT(__SSAT(((q31_t) in2 << shiftBits), 16),
__SSAT(((q31_t) in1 << shiftBits), 16), 16));
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
#else
*pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
*pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
*pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
*pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
}
else
{
while (blkCnt > 0U)
{
/* C = A >> shiftBits */
#if defined (ARM_MATH_DSP)
/* read 2 samples from source */
in1 = *pSrc++;
in2 = *pSrc++;
/* Shift the inputs and then store the results in the destination buffer. */
#ifndef ARM_MATH_BIG_ENDIAN
write_q15x2_ia (&pDst, __PKHBT((in1 >> -shiftBits),
(in2 >> -shiftBits), 16));
#else
write_q15x2_ia (&pDst, __PKHBT((in2 >> -shiftBits),
(in1 >> -shiftBits), 16));
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* read 2 samples from source */
in1 = *pSrc++;
in2 = *pSrc++;
#ifndef ARM_MATH_BIG_ENDIAN
write_q15x2_ia (&pDst, __PKHBT((in1 >> -shiftBits),
(in2 >> -shiftBits), 16));
#else
write_q15x2_ia (&pDst, __PKHBT((in2 >> -shiftBits),
(in1 >> -shiftBits), 16));
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
#else
*pDst++ = (*pSrc++ >> -shiftBits);
*pDst++ = (*pSrc++ >> -shiftBits);
*pDst++ = (*pSrc++ >> -shiftBits);
*pDst++ = (*pSrc++ >> -shiftBits);
#endif
/* Decrement loop counter */
blkCnt--;
}
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
/* If the shift value is positive then do right shift else left shift */
if (sign == 0U)
{
while (blkCnt > 0U)
{
/* C = A << shiftBits */
/* Shift input and store result in destination buffer. */
*pDst++ = __SSAT(((q31_t) *pSrc++ << shiftBits), 16);
/* Decrement loop counter */
blkCnt--;
}
}
else
{
while (blkCnt > 0U)
{
/* C = A >> shiftBits */
/* Shift input and store result in destination buffer. */
*pDst++ = (*pSrc++ >> -shiftBits);
/* Decrement loop counter */
blkCnt--;
}
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicShift group
*/

View File

@@ -0,0 +1,232 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_shift_q31.c
* Description: Shifts the elements of a Q31 vector by a specified number of bits
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@defgroup BasicShift Vector Shift
Shifts the elements of a fixed-point vector by a specified number of bits.
There are separate functions for Q7, Q15, and Q31 data types.
The underlying algorithm used is:
<pre>
pDst[n] = pSrc[n] << shift, 0 <= n < blockSize.
</pre>
If <code>shift</code> is positive then the elements of the vector are shifted to the left.
If <code>shift</code> is negative then the elements of the vector are shifted to the right.
The functions support in-place computation allowing the source and destination
pointers to reference the same memory buffer.
*/
/**
@addtogroup BasicShift
@{
*/
/**
@brief Shifts the elements of a Q31 vector a specified number of bits.
@param[in] pSrc points to the input vector
@param[in] shiftBits number of bits to shift. A positive value shifts left; a negative value shifts right.
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in the vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_shift_q31(
const q31_t * pSrc,
int8_t shiftBits,
q31_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counters */
q31x4_t vecSrc;
q31x4_t vecDst;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2;
while (blkCnt > 0U)
{
/*
* C = A (>> or <<) shiftBits
* Shift the input and then store the result in the destination buffer.
*/
vecSrc = vld1q((q31_t const *) pSrc);
vecDst = vqshlq_r(vecSrc, shiftBits);
vst1q(pDst, vecDst);
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrc += 4;
pDst += 4;
}
/*
* tail
*/
blkCnt = blockSize & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecSrc = vld1q((q31_t const *) pSrc);
vecDst = vqshlq_r(vecSrc, shiftBits);
vstrwq_p(pDst, vecDst, p0);
}
}
#else
void arm_shift_q31(
const q31_t * pSrc,
int8_t shiftBits,
q31_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
uint8_t sign = (shiftBits & 0x80); /* Sign of shiftBits */
#if defined (ARM_MATH_LOOPUNROLL)
q31_t in, out; /* Temporary variables */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* If the shift value is positive then do right shift else left shift */
if (sign == 0U)
{
while (blkCnt > 0U)
{
/* C = A << shiftBits */
/* Shift input and store result in destination buffer. */
in = *pSrc++;
out = in << shiftBits;
if (in != (out >> shiftBits))
out = 0x7FFFFFFF ^ (in >> 31);
*pDst++ = out;
in = *pSrc++;
out = in << shiftBits;
if (in != (out >> shiftBits))
out = 0x7FFFFFFF ^ (in >> 31);
*pDst++ = out;
in = *pSrc++;
out = in << shiftBits;
if (in != (out >> shiftBits))
out = 0x7FFFFFFF ^ (in >> 31);
*pDst++ = out;
in = *pSrc++;
out = in << shiftBits;
if (in != (out >> shiftBits))
out = 0x7FFFFFFF ^ (in >> 31);
*pDst++ = out;
/* Decrement loop counter */
blkCnt--;
}
}
else
{
while (blkCnt > 0U)
{
/* C = A >> shiftBits */
/* Shift input and store results in destination buffer. */
*pDst++ = (*pSrc++ >> -shiftBits);
*pDst++ = (*pSrc++ >> -shiftBits);
*pDst++ = (*pSrc++ >> -shiftBits);
*pDst++ = (*pSrc++ >> -shiftBits);
/* Decrement loop counter */
blkCnt--;
}
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
/* If the shift value is positive then do right shift else left shift */
if (sign == 0U)
{
while (blkCnt > 0U)
{
/* C = A << shiftBits */
/* Shift input and store result in destination buffer. */
*pDst++ = clip_q63_to_q31((q63_t) *pSrc++ << shiftBits);
/* Decrement loop counter */
blkCnt--;
}
}
else
{
while (blkCnt > 0U)
{
/* C = A >> shiftBits */
/* Shift input and store result in destination buffer. */
*pDst++ = (*pSrc++ >> -shiftBits);
/* Decrement loop counter */
blkCnt--;
}
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicShift group
*/

View File

@@ -0,0 +1,225 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_shift_q7.c
* Description: Processing function for the Q7 Shifting
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicShift
@{
*/
/**
@brief Shifts the elements of a Q7 vector a specified number of bits
@param[in] pSrc points to the input vector
@param[in] shiftBits number of bits to shift. A positive value shifts left; a negative value shifts right.
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par onditions for optimum performance
Input and output buffers should be aligned by 32-bit
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_shift_q7(
const q7_t * pSrc,
int8_t shiftBits,
q7_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counters */
q7x16_t vecSrc;
q7x16_t vecDst;
/* Compute 16 outputs at a time */
blkCnt = blockSize >> 4;
while (blkCnt > 0U)
{
/*
* C = A (>> or <<) shiftBits
* Shift the input and then store the result in the destination buffer.
*/
vecSrc = vld1q(pSrc);
vecDst = vqshlq_r(vecSrc, shiftBits);
vst1q(pDst, vecDst);
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrc += 16;
pDst += 16;
}
/*
* tail
*/
blkCnt = blockSize & 0xF;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp8q(blkCnt);
vecSrc = vld1q(pSrc);
vecDst = vqshlq_r(vecSrc, shiftBits);
vstrbq_p(pDst, vecDst, p0);
}
}
#else
void arm_shift_q7(
const q7_t * pSrc,
int8_t shiftBits,
q7_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
uint8_t sign = (shiftBits & 0x80); /* Sign of shiftBits */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
q7_t in1, in2, in3, in4; /* Temporary input variables */
#endif
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* If the shift value is positive then do right shift else left shift */
if (sign == 0U)
{
while (blkCnt > 0U)
{
/* C = A << shiftBits */
#if defined (ARM_MATH_DSP)
/* Read 4 inputs */
in1 = *pSrc++;
in2 = *pSrc++;
in3 = *pSrc++;
in4 = *pSrc++;
/* Pack and store result in destination buffer (in single write) */
write_q7x4_ia (&pDst, __PACKq7(__SSAT(((q15_t) in1 << shiftBits), 8),
__SSAT(((q15_t) in2 << shiftBits), 8),
__SSAT(((q15_t) in3 << shiftBits), 8),
__SSAT(((q15_t) in4 << shiftBits), 8) ));
#else
*pDst++ = (q7_t) __SSAT(((q15_t) *pSrc++ << shiftBits), 8);
*pDst++ = (q7_t) __SSAT(((q15_t) *pSrc++ << shiftBits), 8);
*pDst++ = (q7_t) __SSAT(((q15_t) *pSrc++ << shiftBits), 8);
*pDst++ = (q7_t) __SSAT(((q15_t) *pSrc++ << shiftBits), 8);
#endif
/* Decrement loop counter */
blkCnt--;
}
}
else
{
while (blkCnt > 0U)
{
/* C = A >> shiftBits */
#if defined (ARM_MATH_DSP)
/* Read 4 inputs */
in1 = *pSrc++;
in2 = *pSrc++;
in3 = *pSrc++;
in4 = *pSrc++;
/* Pack and store result in destination buffer (in single write) */
write_q7x4_ia (&pDst, __PACKq7((in1 >> -shiftBits),
(in2 >> -shiftBits),
(in3 >> -shiftBits),
(in4 >> -shiftBits) ));
#else
*pDst++ = (*pSrc++ >> -shiftBits);
*pDst++ = (*pSrc++ >> -shiftBits);
*pDst++ = (*pSrc++ >> -shiftBits);
*pDst++ = (*pSrc++ >> -shiftBits);
#endif
/* Decrement loop counter */
blkCnt--;
}
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
/* If the shift value is positive then do right shift else left shift */
if (sign == 0U)
{
while (blkCnt > 0U)
{
/* C = A << shiftBits */
/* Shift input and store result in destination buffer. */
*pDst++ = (q7_t) __SSAT(((q15_t) *pSrc++ << shiftBits), 8);
/* Decrement loop counter */
blkCnt--;
}
}
else
{
while (blkCnt > 0U)
{
/* C = A >> shiftBits */
/* Shift input and store result in destination buffer. */
*pDst++ = (*pSrc++ >> -shiftBits);
/* Decrement loop counter */
blkCnt--;
}
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicShift group
*/

View File

@@ -0,0 +1,171 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_sub_f16.c
* Description: Floating-point vector subtraction
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions_f16.h"
/**
@ingroup groupMath
*/
/**
@defgroup BasicSub Vector Subtraction
Element-by-element subtraction of two vectors.
<pre>
pDst[n] = pSrcA[n] - pSrcB[n], 0 <= n < blockSize.
</pre>
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
@addtogroup BasicSub
@{
*/
/**
@brief Floating-point vector subtraction.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_sub_f16(
const float16_t * pSrcA,
const float16_t * pSrcB,
float16_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
f16x8_t vec1;
f16x8_t vec2;
f16x8_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 3U;
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
vec1 = vld1q(pSrcA);
vec2 = vld1q(pSrcB);
res = vsubq(vec1, vec2);
vst1q(pDst, res);
/* Increment pointers */
pSrcA += 8;
pSrcB += 8;
pDst += 8;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x7;
if (blkCnt > 0U)
{
/* C = A + B */
mve_pred16_t p0 = vctp16q(blkCnt);
vec1 = vld1q(pSrcA);
vec2 = vld1q(pSrcB);
vstrhq_p(pDst, vsubq(vec1,vec2), p0);
}
}
#else
#if defined(ARM_FLOAT16_SUPPORTED)
void arm_sub_f16(
const float16_t * pSrcA,
const float16_t * pSrcB,
float16_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A - B */
/* Subtract and store result in destination buffer. */
*pDst++ = (_Float16)(*pSrcA++) - (_Float16)(*pSrcB++);
*pDst++ = (_Float16)(*pSrcA++) - (_Float16)(*pSrcB++);
*pDst++ = (_Float16)(*pSrcA++) - (_Float16)(*pSrcB++);
*pDst++ = (_Float16)(*pSrcA++) - (_Float16)(*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A - B */
/* Subtract and store result in destination buffer. */
*pDst++ = (_Float16)(*pSrcA++) - (_Float16)(*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
}
#endif
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicSub group
*/

View File

@@ -0,0 +1,202 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_sub_f32.c
* Description: Floating-point vector subtraction
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@defgroup BasicSub Vector Subtraction
Element-by-element subtraction of two vectors.
<pre>
pDst[n] = pSrcA[n] - pSrcB[n], 0 <= n < blockSize.
</pre>
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
@addtogroup BasicSub
@{
*/
/**
@brief Floating-point vector subtraction.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_sub_f32(
const float32_t * pSrcA,
const float32_t * pSrcB,
float32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
f32x4_t vec1;
f32x4_t vec2;
f32x4_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
vec1 = vld1q(pSrcA);
vec2 = vld1q(pSrcB);
res = vsubq(vec1, vec2);
vst1q(pDst, res);
/* Increment pointers */
pSrcA += 4;
pSrcB += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x3;
if (blkCnt > 0U)
{
/* C = A + B */
mve_pred16_t p0 = vctp32q(blkCnt);
vec1 = vld1q(pSrcA);
vec2 = vld1q(pSrcB);
vstrwq_p(pDst, vsubq(vec1,vec2), p0);
}
}
#else
void arm_sub_f32(
const float32_t * pSrcA,
const float32_t * pSrcB,
float32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
f32x4_t vec1;
f32x4_t vec2;
f32x4_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A - B */
/* Subtract and then store the results in the destination buffer. */
vec1 = vld1q_f32(pSrcA);
vec2 = vld1q_f32(pSrcB);
res = vsubq_f32(vec1, vec2);
vst1q_f32(pDst, res);
/* Increment pointers */
pSrcA += 4;
pSrcB += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x3;
#else
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A - B */
/* Subtract and store result in destination buffer. */
*pDst++ = (*pSrcA++) - (*pSrcB++);
*pDst++ = (*pSrcA++) - (*pSrcB++);
*pDst++ = (*pSrcA++) - (*pSrcB++);
*pDst++ = (*pSrcA++) - (*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* C = A - B */
/* Subtract and store result in destination buffer. */
*pDst++ = (*pSrcA++) - (*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicSub group
*/

View File

@@ -0,0 +1,75 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_sub_f64.c
* Description: Floating-point vector subtraction
*
* $Date: 13 September 2021
* $Revision: V1.10.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicSub
@{
*/
/**
@brief Floating-point vector subtraction.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_sub_f64(
const float64_t * pSrcA,
const float64_t * pSrcB,
float64_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* C = A - B */
/* Subtract and store result in destination buffer. */
*pDst++ = (*pSrcA++) - (*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
}
/**
@} end of BasicSub group
*/

View File

@@ -0,0 +1,178 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_sub_q15.c
* Description: Q15 vector subtraction
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicSub
@{
*/
/**
@brief Q15 vector subtraction.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_sub_q15(
const q15_t * pSrcA,
const q15_t * pSrcB,
q15_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counters */
q15x8_t vecA;
q15x8_t vecB;
/* Compute 8 outputs at a time */
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
/*
* C = A - B
* Subtract and then store the results in the destination buffer.
*/
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vst1q(pDst, vqsubq(vecA, vecB));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrcA += 8;
pSrcB += 8;
pDst += 8;
}
/*
* tail
*/
blkCnt = blockSize & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vstrhq_p(pDst, vqsubq(vecA, vecB), p0);
}
}
#else
void arm_sub_q15(
const q15_t * pSrcA,
const q15_t * pSrcB,
q15_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
q31_t inA1, inA2;
q31_t inB1, inB2;
#endif
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A - B */
#if defined (ARM_MATH_DSP)
/* read 2 times 2 samples at a time from sourceA */
inA1 = read_q15x2_ia (&pSrcA);
inA2 = read_q15x2_ia (&pSrcA);
/* read 2 times 2 samples at a time from sourceB */
inB1 = read_q15x2_ia (&pSrcB);
inB2 = read_q15x2_ia (&pSrcB);
/* Subtract and store 2 times 2 samples at a time */
write_q15x2_ia (&pDst, __QSUB16(inA1, inB1));
write_q15x2_ia (&pDst, __QSUB16(inA2, inB2));
#else
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A - B */
/* Subtract and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
*pDst++ = (q15_t) __QSUB16(*pSrcA++, *pSrcB++);
#else
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ - *pSrcB++), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicSub group
*/

View File

@@ -0,0 +1,159 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_sub_q31.c
* Description: Q31 vector subtraction
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicSub
@{
*/
/**
@brief Q31 vector subtraction.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_sub_q31(
const q31_t * pSrcA,
const q31_t * pSrcB,
q31_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt;
q31x4_t vecA;
q31x4_t vecB;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2;
while (blkCnt > 0U)
{
/*
* C = A + B
* Add and then store the results in the destination buffer.
*/
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vst1q(pDst, vqsubq(vecA, vecB));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrcA += 4;
pSrcB += 4;
pDst += 4;
}
/*
* tail
*/
blkCnt = blockSize & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vstrwq_p(pDst, vqsubq(vecA, vecB), p0);
}
}
#else
void arm_sub_q31(
const q31_t * pSrcA,
const q31_t * pSrcB,
q31_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A - B */
/* Subtract and store result in destination buffer. */
*pDst++ = __QSUB(*pSrcA++, *pSrcB++);
*pDst++ = __QSUB(*pSrcA++, *pSrcB++);
*pDst++ = __QSUB(*pSrcA++, *pSrcB++);
*pDst++ = __QSUB(*pSrcA++, *pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A - B */
/* Subtract and store result in destination buffer. */
*pDst++ = __QSUB(*pSrcA++, *pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicSub group
*/

View File

@@ -0,0 +1,158 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_sub_q7.c
* Description: Q7 vector subtraction
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicSub
@{
*/
/**
@brief Q7 vector subtraction.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_sub_q7(
const q7_t * pSrcA,
const q7_t * pSrcB,
q7_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counters */
q7x16_t vecA;
q7x16_t vecB;
/* Compute 16 outputs at a time */
blkCnt = blockSize >> 4;
while (blkCnt > 0U)
{
/*
* C = A - B
* Subtract and then store the results in the destination buffer.
*/
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vst1q(pDst, vqsubq(vecA, vecB));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrcA += 16;
pSrcB += 16;
pDst += 16;
}
/*
* tail
*/
blkCnt = blockSize & 0xF;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp8q(blkCnt);
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vstrbq_p(pDst, vqsubq(vecA, vecB), p0);
}
}
#else
void arm_sub_q7(
const q7_t * pSrcA,
const q7_t * pSrcB,
q7_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A - B */
#if defined (ARM_MATH_DSP)
/* Subtract and store result in destination buffer (4 samples at a time). */
write_q7x4_ia (&pDst, __QSUB8(read_q7x4_ia (&pSrcA), read_q7x4_ia (&pSrcB)));
#else
*pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ - *pSrcB++, 8);
*pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ - *pSrcB++, 8);
*pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ - *pSrcB++, 8);
*pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ - *pSrcB++, 8);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A - B */
/* Subtract and store result in destination buffer. */
*pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ - *pSrcB++, 8);
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicSub group
*/

View File

@@ -0,0 +1,137 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_xor_u16.c
* Description: uint16_t bitwise exclusive OR
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@defgroup Xor Vector bitwise exclusive OR
Compute the logical bitwise XOR.
There are separate functions for uint32_t, uint16_t, and uint8_t data types.
*/
/**
@addtogroup Xor
@{
*/
/**
@brief Compute the logical bitwise XOR of two fixed-point vectors.
@param[in] pSrcA points to input vector A
@param[in] pSrcB points to input vector B
@param[out] pDst points to output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_xor_u16(
const uint16_t * pSrcA,
const uint16_t * pSrcB,
uint16_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
uint16x8_t vecSrcA, vecSrcB;
/* Compute 8 outputs at a time */
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
vst1q(pDst, veorq_u16(vecSrcA, vecSrcB) );
pSrcA += 8;
pSrcB += 8;
pDst += 8;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
vstrhq_p(pDst, veorq_u16(vecSrcA, vecSrcB), p0);
}
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
uint16x8_t vecA, vecB;
/* Compute 8 outputs at a time */
blkCnt = blockSize >> 3U;
while (blkCnt > 0U)
{
vecA = vld1q_u16(pSrcA);
vecB = vld1q_u16(pSrcB);
vst1q_u16(pDst, veorq_u16(vecA, vecB) );
pSrcA += 8;
pSrcB += 8;
pDst += 8;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 7;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif
while (blkCnt > 0U)
{
*pDst++ = (*pSrcA++)^(*pSrcB++);
/* Decrement the loop counter */
blkCnt--;
}
#endif /* if defined(ARM_MATH_MVEI) */
}
/**
@} end of Xor group
*/

View File

@@ -0,0 +1,129 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_xor_u32.c
* Description: uint32_t bitwise exclusive OR
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup Xor
@{
*/
/**
@brief Compute the logical bitwise XOR of two fixed-point vectors.
@param[in] pSrcA points to input vector A
@param[in] pSrcB points to input vector B
@param[out] pDst points to output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_xor_u32(
const uint32_t * pSrcA,
const uint32_t * pSrcB,
uint32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
uint32x4_t vecSrcA, vecSrcB;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2;
while (blkCnt > 0U)
{
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
vst1q(pDst, veorq_u32(vecSrcA, vecSrcB) );
pSrcA += 4;
pSrcB += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
vstrwq_p(pDst, veorq_u32(vecSrcA, vecSrcB), p0);
}
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
uint32x4_t vecA, vecB;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
vecA = vld1q_u32(pSrcA);
vecB = vld1q_u32(pSrcB);
vst1q_u32(pDst, veorq_u32(vecA, vecB) );
pSrcA += 4;
pSrcB += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 3;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif
while (blkCnt > 0U)
{
*pDst++ = (*pSrcA++)^(*pSrcB++);
/* Decrement the loop counter */
blkCnt--;
}
#endif /* if defined(ARM_MATH_MVEI) */
}
/**
@} end of Xor group
*/

View File

@@ -0,0 +1,129 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_xor_u8.c
* Description: uint8_t bitwise exclusive OR
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup Xor
@{
*/
/**
@brief Compute the logical bitwise XOR of two fixed-point vectors.
@param[in] pSrcA points to input vector A
@param[in] pSrcB points to input vector B
@param[out] pDst points to output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_xor_u8(
const uint8_t * pSrcA,
const uint8_t * pSrcB,
uint8_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
uint8x16_t vecSrcA, vecSrcB;
/* Compute 16 outputs at a time */
blkCnt = blockSize >> 4;
while (blkCnt > 0U)
{
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
vst1q(pDst, veorq_u8(vecSrcA, vecSrcB) );
pSrcA += 16;
pSrcB += 16;
pDst += 16;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0xF;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp8q(blkCnt);
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
vstrbq_p(pDst, veorq_u8(vecSrcA, vecSrcB), p0);
}
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
uint8x16_t vecA, vecB;
/* Compute 16 outputs at a time */
blkCnt = blockSize >> 4U;
while (blkCnt > 0U)
{
vecA = vld1q_u8(pSrcA);
vecB = vld1q_u8(pSrcB);
vst1q_u8(pDst, veorq_u8(vecA, vecB) );
pSrcA += 16;
pSrcB += 16;
pDst += 16;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0xF;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif
while (blkCnt > 0U)
{
*pDst++ = (*pSrcA++)^(*pSrcB++);
/* Decrement the loop counter */
blkCnt--;
}
#endif /* if defined(ARM_MATH_MVEI) */
}
/**
@} end of Xor group
*/

View File

@@ -0,0 +1,29 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: BayesFunctions.c
* Description: Combination of all bayes function source files.
*
* $Date: 16. March 2020
* $Revision: V1.0.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2020 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_gaussian_naive_bayes_predict_f32.c"

View File

@@ -0,0 +1,27 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: BayesFunctions.c
* Description: Combination of all bayes function f16 source files.
*
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2020 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_gaussian_naive_bayes_predict_f16.c"

View File

@@ -0,0 +1,207 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_naive_gaussian_bayes_predict_f16
* Description: Naive Gaussian Bayesian Estimator
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/bayes_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
#include <limits.h>
#include <math.h>
/**
* @addtogroup groupBayes
* @{
*/
/**
* @brief Naive Gaussian Bayesian Estimator
*
* @param[in] *S points to a naive bayes instance structure
* @param[in] *in points to the elements of the input vector.
* @param[out] *pOutputProbabilities points to a buffer of length numberOfClasses containing estimated probabilities
* @param[out] *pBufferB points to a temporary buffer of length numberOfClasses
* @return The predicted class
*
*
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
#include "arm_vec_math_f16.h"
uint32_t arm_gaussian_naive_bayes_predict_f16(const arm_gaussian_naive_bayes_instance_f16 *S,
const float16_t * in,
float16_t *pOutputProbabilities,
float16_t *pBufferB
)
{
uint32_t nbClass;
const float16_t *pTheta = S->theta;
const float16_t *pSigma = S->sigma;
float16_t *buffer = pOutputProbabilities;
const float16_t *pIn = in;
float16_t result;
f16x8_t vsigma;
_Float16 tmp;
f16x8_t vacc1, vacc2;
uint32_t index;
float16_t *logclassPriors=pBufferB;
float16_t *pLogPrior = logclassPriors;
arm_vlog_f16((float16_t *) S->classPriors, logclassPriors, S->numberOfClasses);
pTheta = S->theta;
pSigma = S->sigma;
for (nbClass = 0; nbClass < S->numberOfClasses; nbClass++) {
pIn = in;
vacc1 = vdupq_n_f16(0.0f16);
vacc2 = vdupq_n_f16(0.0f16);
uint32_t blkCnt =S->vectorDimension >> 3;
while (blkCnt > 0U) {
f16x8_t vinvSigma, vtmp;
vsigma = vaddq_n_f16(vld1q(pSigma), S->epsilon);
vacc1 = vaddq(vacc1, vlogq_f16(vmulq_n_f16(vsigma, 2.0f16 * (_Float16)PI)));
vinvSigma = vrecip_medprec_f16(vsigma);
vtmp = vsubq(vld1q(pIn), vld1q(pTheta));
/* squaring */
vtmp = vmulq(vtmp, vtmp);
vacc2 = vfmaq(vacc2, vtmp, vinvSigma);
pIn += 8;
pTheta += 8;
pSigma += 8;
blkCnt--;
}
blkCnt = S->vectorDimension & 7;
if (blkCnt > 0U) {
mve_pred16_t p0 = vctp16q(blkCnt);
f16x8_t vinvSigma, vtmp;
vsigma = vaddq_n_f16(vld1q(pSigma), S->epsilon);
vacc1 =
vaddq_m_f16(vacc1, vacc1, vlogq_f16(vmulq_n_f16(vsigma, 2.0f16 * (_Float16)PI)), p0);
vinvSigma = vrecip_medprec_f16(vsigma);
vtmp = vsubq(vld1q(pIn), vld1q(pTheta));
/* squaring */
vtmp = vmulq(vtmp, vtmp);
vacc2 = vfmaq_m_f16(vacc2, vtmp, vinvSigma, p0);
pTheta += blkCnt;
pSigma += blkCnt;
}
tmp = -0.5f16 * (_Float16)vecAddAcrossF16Mve(vacc1);
tmp -= 0.5f16 * (_Float16)vecAddAcrossF16Mve(vacc2);
*buffer = (_Float16)tmp + (_Float16)*pLogPrior++;
buffer++;
}
arm_max_f16(pOutputProbabilities, S->numberOfClasses, &result, &index);
return (index);
}
#else
uint32_t arm_gaussian_naive_bayes_predict_f16(const arm_gaussian_naive_bayes_instance_f16 *S,
const float16_t * in,
float16_t *pOutputProbabilities,
float16_t *pBufferB)
{
uint32_t nbClass;
uint32_t nbDim;
const float16_t *pPrior = S->classPriors;
const float16_t *pTheta = S->theta;
const float16_t *pSigma = S->sigma;
float16_t *buffer = pOutputProbabilities;
const float16_t *pIn=in;
float16_t result;
_Float16 sigma;
_Float16 tmp;
_Float16 acc1,acc2;
uint32_t index;
(void)pBufferB;
pTheta=S->theta;
pSigma=S->sigma;
for(nbClass = 0; nbClass < S->numberOfClasses; nbClass++)
{
pIn = in;
tmp = 0.0f16;
acc1 = 0.0f16;
acc2 = 0.0f16;
for(nbDim = 0; nbDim < S->vectorDimension; nbDim++)
{
sigma = (_Float16)*pSigma + (_Float16)S->epsilon;
acc1 += (_Float16)logf(2.0f * PI * (float32_t)sigma);
acc2 += ((_Float16)*pIn - (_Float16)*pTheta) * ((_Float16)*pIn - (_Float16)*pTheta) / (_Float16)sigma;
pIn++;
pTheta++;
pSigma++;
}
tmp = -0.5f16 * (_Float16)acc1;
tmp -= 0.5f16 * (_Float16)acc2;
*buffer = (_Float16)tmp + (_Float16)logf((float32_t)*pPrior++);
buffer++;
}
arm_max_f16(pOutputProbabilities,S->numberOfClasses,&result,&index);
return(index);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of groupBayes group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -0,0 +1,396 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_naive_gaussian_bayes_predict_f32
* Description: Naive Gaussian Bayesian Estimator
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/bayes_functions.h"
#include <limits.h>
#include <math.h>
#define PI_F 3.1415926535897932384626433832795f
#define DPI_F (2.0f*3.1415926535897932384626433832795f)
/**
* @addtogroup groupBayes
* @{
*/
/**
* @brief Naive Gaussian Bayesian Estimator
*
* @param[in] *S points to a naive bayes instance structure
* @param[in] *in points to the elements of the input vector.
* @param[out] *pOutputProbabilities points to a buffer of length numberOfClasses containing estimated probabilities
* @param[out] *pBufferB points to a temporary buffer of length numberOfClasses
* @return The predicted class
*
*
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
#include "arm_vec_math.h"
uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_instance_f32 *S,
const float32_t * in,
float32_t *pOutputProbabilities,
float32_t *pBufferB
)
{
uint32_t nbClass;
const float32_t *pTheta = S->theta;
const float32_t *pSigma = S->sigma;
float32_t *buffer = pOutputProbabilities;
const float32_t *pIn = in;
float32_t result;
f32x4_t vsigma;
float32_t tmp;
f32x4_t vacc1, vacc2;
uint32_t index;
float32_t *logclassPriors=pBufferB;
float32_t *pLogPrior = logclassPriors;
arm_vlog_f32((float32_t *) S->classPriors, logclassPriors, S->numberOfClasses);
pTheta = S->theta;
pSigma = S->sigma;
for (nbClass = 0; nbClass < S->numberOfClasses; nbClass++) {
pIn = in;
vacc1 = vdupq_n_f32(0);
vacc2 = vdupq_n_f32(0);
uint32_t blkCnt =S->vectorDimension >> 2;
while (blkCnt > 0U) {
f32x4_t vinvSigma, vtmp;
vsigma = vaddq_n_f32(vld1q(pSigma), S->epsilon);
vacc1 = vaddq(vacc1, vlogq_f32(vmulq_n_f32(vsigma, 2.0f * PI)));
vinvSigma = vrecip_medprec_f32(vsigma);
vtmp = vsubq(vld1q(pIn), vld1q(pTheta));
/* squaring */
vtmp = vmulq(vtmp, vtmp);
vacc2 = vfmaq(vacc2, vtmp, vinvSigma);
pIn += 4;
pTheta += 4;
pSigma += 4;
blkCnt--;
}
blkCnt = S->vectorDimension & 3;
if (blkCnt > 0U) {
mve_pred16_t p0 = vctp32q(blkCnt);
f32x4_t vinvSigma, vtmp;
vsigma = vaddq_n_f32(vld1q(pSigma), S->epsilon);
vacc1 =
vaddq_m_f32(vacc1, vacc1, vlogq_f32(vmulq_n_f32(vsigma, 2.0f * PI)), p0);
vinvSigma = vrecip_medprec_f32(vsigma);
vtmp = vsubq(vld1q(pIn), vld1q(pTheta));
/* squaring */
vtmp = vmulq(vtmp, vtmp);
vacc2 = vfmaq_m_f32(vacc2, vtmp, vinvSigma, p0);
pTheta += blkCnt;
pSigma += blkCnt;
}
tmp = -0.5f * vecAddAcrossF32Mve(vacc1);
tmp -= 0.5f * vecAddAcrossF32Mve(vacc2);
*buffer = tmp + *pLogPrior++;
buffer++;
}
arm_max_f32(pOutputProbabilities, S->numberOfClasses, &result, &index);
return (index);
}
#else
#if defined(ARM_MATH_NEON)
#include "NEMath.h"
uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_instance_f32 *S,
const float32_t * in,
float32_t *pOutputProbabilities,
float32_t *pBufferB)
{
const float32_t *pPrior = S->classPriors;
const float32_t *pTheta = S->theta;
const float32_t *pSigma = S->sigma;
const float32_t *pTheta1 = S->theta + S->vectorDimension;
const float32_t *pSigma1 = S->sigma + S->vectorDimension;
float32_t *buffer = pOutputProbabilities;
const float32_t *pIn=in;
float32_t result;
float32_t sigma,sigma1;
float32_t tmp,tmp1;
uint32_t index;
uint32_t vecBlkCnt;
uint32_t classBlkCnt;
float32x4_t epsilonV;
float32x4_t sigmaV,sigmaV1;
float32x4_t tmpV,tmpVb,tmpV1;
float32x2_t tmpV2;
float32x4_t thetaV,thetaV1;
float32x4_t inV;
(void)pBufferB;
epsilonV = vdupq_n_f32(S->epsilon);
classBlkCnt = S->numberOfClasses >> 1;
while(classBlkCnt > 0)
{
pIn = in;
tmp = logf(*pPrior++);
tmp1 = logf(*pPrior++);
tmpV = vdupq_n_f32(0.0f);
tmpV1 = vdupq_n_f32(0.0f);
vecBlkCnt = S->vectorDimension >> 2;
while(vecBlkCnt > 0)
{
sigmaV = vld1q_f32(pSigma);
thetaV = vld1q_f32(pTheta);
sigmaV1 = vld1q_f32(pSigma1);
thetaV1 = vld1q_f32(pTheta1);
inV = vld1q_f32(pIn);
sigmaV = vaddq_f32(sigmaV, epsilonV);
sigmaV1 = vaddq_f32(sigmaV1, epsilonV);
tmpVb = vmulq_n_f32(sigmaV,DPI_F);
tmpVb = vlogq_f32(tmpVb);
tmpV = vmlsq_n_f32(tmpV,tmpVb,0.5f);
tmpVb = vmulq_n_f32(sigmaV1,DPI_F);
tmpVb = vlogq_f32(tmpVb);
tmpV1 = vmlsq_n_f32(tmpV1,tmpVb,0.5f);
tmpVb = vsubq_f32(inV,thetaV);
tmpVb = vmulq_f32(tmpVb,tmpVb);
tmpVb = vmulq_f32(tmpVb, vinvq_f32(sigmaV));
tmpV = vmlsq_n_f32(tmpV,tmpVb,0.5f);
tmpVb = vsubq_f32(inV,thetaV1);
tmpVb = vmulq_f32(tmpVb,tmpVb);
tmpVb = vmulq_f32(tmpVb, vinvq_f32(sigmaV1));
tmpV1 = vmlsq_n_f32(tmpV1,tmpVb,0.5f);
pIn += 4;
pTheta += 4;
pSigma += 4;
pTheta1 += 4;
pSigma1 += 4;
vecBlkCnt--;
}
tmpV2 = vpadd_f32(vget_low_f32(tmpV),vget_high_f32(tmpV));
tmp += vget_lane_f32(tmpV2, 0) + vget_lane_f32(tmpV2, 1);
tmpV2 = vpadd_f32(vget_low_f32(tmpV1),vget_high_f32(tmpV1));
tmp1 += vget_lane_f32(tmpV2, 0) + vget_lane_f32(tmpV2, 1);
vecBlkCnt = S->vectorDimension & 3;
while(vecBlkCnt > 0)
{
sigma = *pSigma + S->epsilon;
sigma1 = *pSigma1 + S->epsilon;
tmp -= 0.5f*logf(2.0f * PI_F * sigma);
tmp -= 0.5f*(*pIn - *pTheta) * (*pIn - *pTheta) / sigma;
tmp1 -= 0.5f*logf(2.0f * PI_F * sigma1);
tmp1 -= 0.5f*(*pIn - *pTheta1) * (*pIn - *pTheta1) / sigma1;
pIn++;
pTheta++;
pSigma++;
pTheta1++;
pSigma1++;
vecBlkCnt--;
}
*buffer++ = tmp;
*buffer++ = tmp1;
pSigma += S->vectorDimension;
pTheta += S->vectorDimension;
pSigma1 += S->vectorDimension;
pTheta1 += S->vectorDimension;
classBlkCnt--;
}
classBlkCnt = S->numberOfClasses & 1;
while(classBlkCnt > 0)
{
pIn = in;
tmp = logf(*pPrior++);
tmpV = vdupq_n_f32(0.0f);
vecBlkCnt = S->vectorDimension >> 2;
while(vecBlkCnt > 0)
{
sigmaV = vld1q_f32(pSigma);
thetaV = vld1q_f32(pTheta);
inV = vld1q_f32(pIn);
sigmaV = vaddq_f32(sigmaV, epsilonV);
tmpVb = vmulq_n_f32(sigmaV,DPI_F);
tmpVb = vlogq_f32(tmpVb);
tmpV = vmlsq_n_f32(tmpV,tmpVb,0.5f);
tmpVb = vsubq_f32(inV,thetaV);
tmpVb = vmulq_f32(tmpVb,tmpVb);
tmpVb = vmulq_f32(tmpVb, vinvq_f32(sigmaV));
tmpV = vmlsq_n_f32(tmpV,tmpVb,0.5f);
pIn += 4;
pTheta += 4;
pSigma += 4;
vecBlkCnt--;
}
tmpV2 = vpadd_f32(vget_low_f32(tmpV),vget_high_f32(tmpV));
tmp += vget_lane_f32(tmpV2, 0) + vget_lane_f32(tmpV2, 1);
vecBlkCnt = S->vectorDimension & 3;
while(vecBlkCnt > 0)
{
sigma = *pSigma + S->epsilon;
tmp -= 0.5f*logf(2.0f * PI_F * sigma);
tmp -= 0.5f*(*pIn - *pTheta) * (*pIn - *pTheta) / sigma;
pIn++;
pTheta++;
pSigma++;
vecBlkCnt--;
}
*buffer++ = tmp;
classBlkCnt--;
}
arm_max_f32(pOutputProbabilities,S->numberOfClasses,&result,&index);
return(index);
}
#else
uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_instance_f32 *S,
const float32_t * in,
float32_t *pOutputProbabilities,
float32_t *pBufferB)
{
uint32_t nbClass;
uint32_t nbDim;
const float32_t *pPrior = S->classPriors;
const float32_t *pTheta = S->theta;
const float32_t *pSigma = S->sigma;
float32_t *buffer = pOutputProbabilities;
const float32_t *pIn=in;
float32_t result;
float32_t sigma;
float32_t tmp;
float32_t acc1,acc2;
uint32_t index;
(void)pBufferB;
pTheta=S->theta;
pSigma=S->sigma;
for(nbClass = 0; nbClass < S->numberOfClasses; nbClass++)
{
pIn = in;
tmp = 0.0;
acc1 = 0.0f;
acc2 = 0.0f;
for(nbDim = 0; nbDim < S->vectorDimension; nbDim++)
{
sigma = *pSigma + S->epsilon;
acc1 += logf(2.0f * PI_F * sigma);
acc2 += (*pIn - *pTheta) * (*pIn - *pTheta) / sigma;
pIn++;
pTheta++;
pSigma++;
}
tmp = -0.5f * acc1;
tmp -= 0.5f * acc2;
*buffer = tmp + logf(*pPrior++);
buffer++;
}
arm_max_f32(pOutputProbabilities,S->numberOfClasses,&result,&index);
return(index);
}
#endif
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of groupBayes group
*/

View File

@@ -0,0 +1,31 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: CommonTables.c
* Description: Combination of all common table source files.
*
* $Date: 08. January 2020
* $Revision: V1.1.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2019-2020 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_common_tables.c"
#include "arm_const_structs.c"
#include "arm_mve_tables.c"

View File

@@ -0,0 +1,31 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: CommonTables.c
* Description: Combination of all common table source files.
*
* $Date: 08. January 2020
* $Revision: V1.1.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2019-2020 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_common_tables_f16.c"
#include "arm_const_structs_f16.c"
#include "arm_mve_tables_f16.c"

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,654 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_const_structs.c
* Description: Constant structs that are initialized for user convenience.
* For example, some can be given as arguments to the arm_cfft_f32() or arm_rfft_f32() functions.
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math_types.h"
#include "arm_const_structs.h"
/*
ALLOW TABLE is true when config table is enabled and the Tramsform folder is included
for compilation.
*/
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
/* Floating-point structs */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_16) && defined(ARM_TABLE_BITREVIDX_FLT64_16))
const arm_cfft_instance_f64 arm_cfft_sR_f64_len16 = {
16, (const float64_t *)twiddleCoefF64_16, armBitRevIndexTableF64_16, ARMBITREVINDEXTABLEF64_16_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_32) && defined(ARM_TABLE_BITREVIDX_FLT64_32))
const arm_cfft_instance_f64 arm_cfft_sR_f64_len32 = {
32, (const float64_t *)twiddleCoefF64_32, armBitRevIndexTableF64_32, ARMBITREVINDEXTABLEF64_32_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_64) && defined(ARM_TABLE_BITREVIDX_FLT64_64))
const arm_cfft_instance_f64 arm_cfft_sR_f64_len64 = {
64, (const float64_t *)twiddleCoefF64_64, armBitRevIndexTableF64_64, ARMBITREVINDEXTABLEF64_64_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_128) && defined(ARM_TABLE_BITREVIDX_FLT64_128))
const arm_cfft_instance_f64 arm_cfft_sR_f64_len128 = {
128, (const float64_t *)twiddleCoefF64_128, armBitRevIndexTableF64_128, ARMBITREVINDEXTABLEF64_128_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_256) && defined(ARM_TABLE_BITREVIDX_FLT64_256))
const arm_cfft_instance_f64 arm_cfft_sR_f64_len256 = {
256, (const float64_t *)twiddleCoefF64_256, armBitRevIndexTableF64_256, ARMBITREVINDEXTABLEF64_256_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_512) && defined(ARM_TABLE_BITREVIDX_FLT64_512))
const arm_cfft_instance_f64 arm_cfft_sR_f64_len512 = {
512, (const float64_t *)twiddleCoefF64_512, armBitRevIndexTableF64_512, ARMBITREVINDEXTABLEF64_512_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_1024) && defined(ARM_TABLE_BITREVIDX_FLT64_1024))
const arm_cfft_instance_f64 arm_cfft_sR_f64_len1024 = {
1024, (const float64_t *)twiddleCoefF64_1024, armBitRevIndexTableF64_1024, ARMBITREVINDEXTABLEF64_1024_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_2048) && defined(ARM_TABLE_BITREVIDX_FLT64_2048))
const arm_cfft_instance_f64 arm_cfft_sR_f64_len2048 = {
2048, (const float64_t *)twiddleCoefF64_2048, armBitRevIndexTableF64_2048, ARMBITREVINDEXTABLEF64_2048_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_4096) && defined(ARM_TABLE_BITREVIDX_FLT64_4096))
const arm_cfft_instance_f64 arm_cfft_sR_f64_len4096 = {
4096, (const float64_t *)twiddleCoefF64_4096, armBitRevIndexTableF64_4096, ARMBITREVINDEXTABLEF64_4096_TABLE_LENGTH
};
#endif
/* Floating-point structs */
#if !defined(ARM_MATH_MVEF) || defined(ARM_MATH_AUTOVECTORIZE)
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_16) && defined(ARM_TABLE_BITREVIDX_FLT_16))
const arm_cfft_instance_f32 arm_cfft_sR_f32_len16 = {
16, twiddleCoef_16, armBitRevIndexTable16, ARMBITREVINDEXTABLE_16_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_32) && defined(ARM_TABLE_BITREVIDX_FLT_32))
const arm_cfft_instance_f32 arm_cfft_sR_f32_len32 = {
32, twiddleCoef_32, armBitRevIndexTable32, ARMBITREVINDEXTABLE_32_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_64) && defined(ARM_TABLE_BITREVIDX_FLT_64))
const arm_cfft_instance_f32 arm_cfft_sR_f32_len64 = {
64, twiddleCoef_64, armBitRevIndexTable64, ARMBITREVINDEXTABLE_64_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_128) && defined(ARM_TABLE_BITREVIDX_FLT_128))
const arm_cfft_instance_f32 arm_cfft_sR_f32_len128 = {
128, twiddleCoef_128, armBitRevIndexTable128, ARMBITREVINDEXTABLE_128_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_256) && defined(ARM_TABLE_BITREVIDX_FLT_256))
const arm_cfft_instance_f32 arm_cfft_sR_f32_len256 = {
256, twiddleCoef_256, armBitRevIndexTable256, ARMBITREVINDEXTABLE_256_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_512) && defined(ARM_TABLE_BITREVIDX_FLT_512))
const arm_cfft_instance_f32 arm_cfft_sR_f32_len512 = {
512, twiddleCoef_512, armBitRevIndexTable512, ARMBITREVINDEXTABLE_512_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024))
const arm_cfft_instance_f32 arm_cfft_sR_f32_len1024 = {
1024, twiddleCoef_1024, armBitRevIndexTable1024, ARMBITREVINDEXTABLE_1024_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048))
const arm_cfft_instance_f32 arm_cfft_sR_f32_len2048 = {
2048, twiddleCoef_2048, armBitRevIndexTable2048, ARMBITREVINDEXTABLE_2048_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_4096) && defined(ARM_TABLE_BITREVIDX_FLT_4096))
const arm_cfft_instance_f32 arm_cfft_sR_f32_len4096 = {
4096, twiddleCoef_4096, armBitRevIndexTable4096, ARMBITREVINDEXTABLE_4096_TABLE_LENGTH
};
#endif
#endif /* !defined(ARM_MATH_MVEF) || defined(ARM_MATH_AUTOVECTORIZE) */
/* Fixed-point structs */
#if !defined(ARM_MATH_MVEI) || defined(ARM_MATH_AUTOVECTORIZE)
/*
Those structures cannot be used to initialize the MVE version of the FFT Q31 instances.
So they are not compiled when MVE is defined.
For the MVE version, the new arm_cfft_init_f32 must be used.
*/
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
const arm_cfft_instance_q31 arm_cfft_sR_q31_len16 = {
16, twiddleCoef_16_q31, armBitRevIndexTable_fixed_16, ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
const arm_cfft_instance_q31 arm_cfft_sR_q31_len32 = {
32, twiddleCoef_32_q31, armBitRevIndexTable_fixed_32, ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
const arm_cfft_instance_q31 arm_cfft_sR_q31_len64 = {
64, twiddleCoef_64_q31, armBitRevIndexTable_fixed_64, ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
const arm_cfft_instance_q31 arm_cfft_sR_q31_len128 = {
128, twiddleCoef_128_q31, armBitRevIndexTable_fixed_128, ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
const arm_cfft_instance_q31 arm_cfft_sR_q31_len256 = {
256, twiddleCoef_256_q31, armBitRevIndexTable_fixed_256, ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
const arm_cfft_instance_q31 arm_cfft_sR_q31_len512 = {
512, twiddleCoef_512_q31, armBitRevIndexTable_fixed_512, ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
const arm_cfft_instance_q31 arm_cfft_sR_q31_len1024 = {
1024, twiddleCoef_1024_q31, armBitRevIndexTable_fixed_1024, ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
const arm_cfft_instance_q31 arm_cfft_sR_q31_len2048 = {
2048, twiddleCoef_2048_q31, armBitRevIndexTable_fixed_2048, ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
const arm_cfft_instance_q31 arm_cfft_sR_q31_len4096 = {
4096, twiddleCoef_4096_q31, armBitRevIndexTable_fixed_4096, ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
const arm_cfft_instance_q15 arm_cfft_sR_q15_len16 = {
16, twiddleCoef_16_q15, armBitRevIndexTable_fixed_16, ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
const arm_cfft_instance_q15 arm_cfft_sR_q15_len32 = {
32, twiddleCoef_32_q15, armBitRevIndexTable_fixed_32, ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
const arm_cfft_instance_q15 arm_cfft_sR_q15_len64 = {
64, twiddleCoef_64_q15, armBitRevIndexTable_fixed_64, ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
const arm_cfft_instance_q15 arm_cfft_sR_q15_len128 = {
128, twiddleCoef_128_q15, armBitRevIndexTable_fixed_128, ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
const arm_cfft_instance_q15 arm_cfft_sR_q15_len256 = {
256, twiddleCoef_256_q15, armBitRevIndexTable_fixed_256, ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
const arm_cfft_instance_q15 arm_cfft_sR_q15_len512 = {
512, twiddleCoef_512_q15, armBitRevIndexTable_fixed_512, ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
const arm_cfft_instance_q15 arm_cfft_sR_q15_len1024 = {
1024, twiddleCoef_1024_q15, armBitRevIndexTable_fixed_1024, ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
const arm_cfft_instance_q15 arm_cfft_sR_q15_len2048 = {
2048, twiddleCoef_2048_q15, armBitRevIndexTable_fixed_2048, ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
const arm_cfft_instance_q15 arm_cfft_sR_q15_len4096 = {
4096, twiddleCoef_4096_q15, armBitRevIndexTable_fixed_4096, ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH
};
#endif
#endif /* !defined(ARM_MATH_MVEI) */
/* Structure for real-value inputs */
/* Double precision strucs */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_32) && defined(ARM_TABLE_BITREVIDX_FLT64_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_32))
const arm_rfft_fast_instance_f64 arm_rfft_fast_sR_f64_len32 = {
{ 16, (const float64_t *)twiddleCoefF64_16, armBitRevIndexTableF64_16, ARMBITREVINDEXTABLEF64_16_TABLE_LENGTH },
32U,
(float64_t *)twiddleCoefF64_rfft_32
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_64) && defined(ARM_TABLE_BITREVIDX_FLT64_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_64))
const arm_rfft_fast_instance_f64 arm_rfft_fast_sR_f64_len64 = {
{ 32, (const float64_t *)twiddleCoefF64_32, armBitRevIndexTableF64_32, ARMBITREVINDEXTABLEF64_32_TABLE_LENGTH },
64U,
(float64_t *)twiddleCoefF64_rfft_64
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_128) && defined(ARM_TABLE_BITREVIDX_FLT64_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_128))
const arm_rfft_fast_instance_f64 arm_rfft_fast_sR_f64_len128 = {
{ 64, (const float64_t *)twiddleCoefF64_64, armBitRevIndexTableF64_64, ARMBITREVINDEXTABLEF64_64_TABLE_LENGTH },
128U,
(float64_t *)twiddleCoefF64_rfft_128
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_256) && defined(ARM_TABLE_BITREVIDX_FLT64_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_256))
const arm_rfft_fast_instance_f64 arm_rfft_fast_sR_f64_len256 = {
{ 128, (const float64_t *)twiddleCoefF64_128, armBitRevIndexTableF64_128, ARMBITREVINDEXTABLEF64_128_TABLE_LENGTH },
256U,
(float64_t *)twiddleCoefF64_rfft_256
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_512) && defined(ARM_TABLE_BITREVIDX_FLT64_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_512))
const arm_rfft_fast_instance_f64 arm_rfft_fast_sR_f64_len512 = {
{ 256, (const float64_t *)twiddleCoefF64_256, armBitRevIndexTableF64_256, ARMBITREVINDEXTABLEF64_256_TABLE_LENGTH },
512U,
(float64_t *)twiddleCoefF64_rfft_512
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_1024) && defined(ARM_TABLE_BITREVIDX_FLT64_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_1024))
const arm_rfft_fast_instance_f64 arm_rfft_fast_sR_f64_len1024 = {
{ 512, (const float64_t *)twiddleCoefF64_512, armBitRevIndexTableF64_512, ARMBITREVINDEXTABLEF64_512_TABLE_LENGTH },
1024U,
(float64_t *)twiddleCoefF64_rfft_1024
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_2048) && defined(ARM_TABLE_BITREVIDX_FLT64_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_2048))
const arm_rfft_fast_instance_f64 arm_rfft_fast_sR_f64_len2048 = {
{ 1024, (const float64_t *)twiddleCoefF64_1024, armBitRevIndexTableF64_1024, ARMBITREVINDEXTABLEF64_1024_TABLE_LENGTH },
2048U,
(float64_t *)twiddleCoefF64_rfft_2048
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_4096) && defined(ARM_TABLE_BITREVIDX_FLT64_4096) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_4096))
const arm_rfft_fast_instance_f64 arm_rfft_fast_sR_f64_len4096 = {
{ 2048, (const float64_t *)twiddleCoefF64_2048, armBitRevIndexTableF64_2048, ARMBITREVINDEXTABLEF64_2048_TABLE_LENGTH },
4096U,
(float64_t *)twiddleCoefF64_rfft_4096
};
#endif
/* Floating-point structs */
#if !defined(ARM_MATH_MVEF) || defined(ARM_MATH_AUTOVECTORIZE)
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_16) && defined(ARM_TABLE_BITREVIDX_FLT_16) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_32))
const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len32 = {
{ 16, twiddleCoef_16, armBitRevIndexTable16, ARMBITREVINDEXTABLE_16_TABLE_LENGTH },
32U,
(float32_t *)twiddleCoef_rfft_32
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_32) && defined(ARM_TABLE_BITREVIDX_FLT_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_64))
const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len64 = {
{ 32, twiddleCoef_32, armBitRevIndexTable32, ARMBITREVINDEXTABLE_32_TABLE_LENGTH },
64U,
(float32_t *)twiddleCoef_rfft_64
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_64) && defined(ARM_TABLE_BITREVIDX_FLT_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_128))
const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len128 = {
{ 64, twiddleCoef_64, armBitRevIndexTable64, ARMBITREVINDEXTABLE_64_TABLE_LENGTH },
128U,
(float32_t *)twiddleCoef_rfft_128
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_128) && defined(ARM_TABLE_BITREVIDX_FLT_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_256))
const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len256 = {
{ 128, twiddleCoef_128, armBitRevIndexTable128, ARMBITREVINDEXTABLE_128_TABLE_LENGTH },
256U,
(float32_t *)twiddleCoef_rfft_256
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_256) && defined(ARM_TABLE_BITREVIDX_FLT_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_512))
const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len512 = {
{ 256, twiddleCoef_256, armBitRevIndexTable256, ARMBITREVINDEXTABLE_256_TABLE_LENGTH },
512U,
(float32_t *)twiddleCoef_rfft_512
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_512) && defined(ARM_TABLE_BITREVIDX_FLT_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_1024))
const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len1024 = {
{ 512, twiddleCoef_512, armBitRevIndexTable512, ARMBITREVINDEXTABLE_512_TABLE_LENGTH },
1024U,
(float32_t *)twiddleCoef_rfft_1024
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_2048))
const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len2048 = {
{ 1024, twiddleCoef_1024, armBitRevIndexTable1024, ARMBITREVINDEXTABLE_1024_TABLE_LENGTH },
2048U,
(float32_t *)twiddleCoef_rfft_2048
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_4096))
const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len4096 = {
{ 2048, twiddleCoef_2048, armBitRevIndexTable2048, ARMBITREVINDEXTABLE_2048_TABLE_LENGTH },
4096U,
(float32_t *)twiddleCoef_rfft_4096
};
#endif
#endif /* #if !defined(ARM_MATH_MVEF) || defined(ARM_MATH_AUTOVECTORIZE) */
/* Fixed-point structs */
/* q31_t */
#if !defined(ARM_MATH_MVEI) || defined(ARM_MATH_AUTOVECTORIZE)
/*
Those structures cannot be used to initialize the MVE version of the FFT Q31 instances.
So they are not compiled when MVE is defined.
For the MVE version, the new arm_cfft_init_f32 must be used.
*/
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
const arm_rfft_instance_q31 arm_rfft_sR_q31_len32 = {
32U,
0,
1,
256U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len16
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
const arm_rfft_instance_q31 arm_rfft_sR_q31_len64 = {
64U,
0,
1,
128U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len32
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
const arm_rfft_instance_q31 arm_rfft_sR_q31_len128 = {
128U,
0,
1,
64U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len64
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
const arm_rfft_instance_q31 arm_rfft_sR_q31_len256 = {
256U,
0,
1,
32U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len128
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
const arm_rfft_instance_q31 arm_rfft_sR_q31_len512 = {
512U,
0,
1,
16U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len256
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
const arm_rfft_instance_q31 arm_rfft_sR_q31_len1024 = {
1024U,
0,
1,
8U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len512
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
const arm_rfft_instance_q31 arm_rfft_sR_q31_len2048 = {
2048U,
0,
1,
4U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len1024
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
const arm_rfft_instance_q31 arm_rfft_sR_q31_len4096 = {
4096U,
0,
1,
2U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len2048
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q31) && defined(ARM_TABLE_TWIDDLECOEF_Q31_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
const arm_rfft_instance_q31 arm_rfft_sR_q31_len8192 = {
8192U,
0,
1,
1U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len4096
};
#endif
/* q15_t */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
const arm_rfft_instance_q15 arm_rfft_sR_q15_len32 = {
32U,
0,
1,
256U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len16
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
const arm_rfft_instance_q15 arm_rfft_sR_q15_len64 = {
64U,
0,
1,
128U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len32
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
const arm_rfft_instance_q15 arm_rfft_sR_q15_len128 = {
128U,
0,
1,
64U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len64
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
const arm_rfft_instance_q15 arm_rfft_sR_q15_len256 = {
256U,
0,
1,
32U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len128
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
const arm_rfft_instance_q15 arm_rfft_sR_q15_len512 = {
512U,
0,
1,
16U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len256
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
const arm_rfft_instance_q15 arm_rfft_sR_q15_len1024 = {
1024U,
0,
1,
8U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len512
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
const arm_rfft_instance_q15 arm_rfft_sR_q15_len2048 = {
2048U,
0,
1,
4U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len1024
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
const arm_rfft_instance_q15 arm_rfft_sR_q15_len4096 = {
4096U,
0,
1,
2U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len2048
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_REALCOEF_Q15) && defined(ARM_TABLE_TWIDDLECOEF_Q15_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
const arm_rfft_instance_q15 arm_rfft_sR_q15_len8192 = {
8192U,
0,
1,
1U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len4096
};
#endif
#endif /* !defined(ARM_MATH_MVEI) */
#endif

View File

@@ -0,0 +1,120 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_const_structs_f16.c
* Description: Constant structs that are initialized for user convenience.
* For example, some can be given as arguments to the arm_cfft_f32() or arm_rfft_f32() functions.
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math_types_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
#include "arm_const_structs_f16.h"
/*
ALLOW TABLE is true when config table is enabled and the Tramsform folder is included
for compilation.
*/
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
/* Floating-point structs */
#if !defined(ARM_MATH_MVE_FLOAT16) || defined(ARM_MATH_AUTOVECTORIZE)
/*
Those structures cannot be used to initialize the MVE version of the FFT F32 instances.
So they are not compiled when MVE is defined.
For the MVE version, the new arm_cfft_init_f16 must be used.
*/
#if !defined(__CC_ARM)
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_16) && defined(ARM_TABLE_BITREVIDX_FLT_16))
const arm_cfft_instance_f16 arm_cfft_sR_f16_len16 = {
16, twiddleCoefF16_16, armBitRevIndexTable_fixed_16, ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_32) && defined(ARM_TABLE_BITREVIDX_FLT_32))
const arm_cfft_instance_f16 arm_cfft_sR_f16_len32 = {
32, twiddleCoefF16_32, armBitRevIndexTable_fixed_32, ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_64) && defined(ARM_TABLE_BITREVIDX_FLT_64))
const arm_cfft_instance_f16 arm_cfft_sR_f16_len64 = {
64, twiddleCoefF16_64, armBitRevIndexTable_fixed_64, ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_128) && defined(ARM_TABLE_BITREVIDX_FLT_128))
const arm_cfft_instance_f16 arm_cfft_sR_f16_len128 = {
128, twiddleCoefF16_128, armBitRevIndexTable_fixed_128, ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_256) && defined(ARM_TABLE_BITREVIDX_FLT_256))
const arm_cfft_instance_f16 arm_cfft_sR_f16_len256 = {
256, twiddleCoefF16_256, armBitRevIndexTable_fixed_256, ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_512) && defined(ARM_TABLE_BITREVIDX_FLT_512))
const arm_cfft_instance_f16 arm_cfft_sR_f16_len512 = {
512, twiddleCoefF16_512, armBitRevIndexTable_fixed_512, ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024))
const arm_cfft_instance_f16 arm_cfft_sR_f16_len1024 = {
1024, twiddleCoefF16_1024, armBitRevIndexTable_fixed_1024, ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048))
const arm_cfft_instance_f16 arm_cfft_sR_f16_len2048 = {
2048, twiddleCoefF16_2048, armBitRevIndexTable_fixed_2048, ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_4096) && defined(ARM_TABLE_BITREVIDX_FLT_4096))
const arm_cfft_instance_f16 arm_cfft_sR_f16_len4096 = {
4096, twiddleCoefF16_4096, armBitRevIndexTable_fixed_4096, ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH
};
#endif
#endif
#endif /* !defined(ARM_MATH_MVEF) || defined(ARM_MATH_AUTOVECTORIZE) */
#endif
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,66 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: CompexMathFunctions.c
* Description: Combination of all comlex math function source files.
*
* $Date: 18. March 2019
* $Revision: V1.0.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_cmplx_conj_f32.c"
#include "arm_cmplx_conj_q15.c"
#include "arm_cmplx_conj_q31.c"
#include "arm_cmplx_dot_prod_f32.c"
#include "arm_cmplx_dot_prod_q15.c"
#include "arm_cmplx_dot_prod_q31.c"
#include "arm_cmplx_mag_f32.c"
#include "arm_cmplx_mag_f64.c"
#if (defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEI)) && !defined(ARM_MATH_AUTOVECTORIZE)
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q31_MVE)
#include "arm_cmplx_mag_q15.c"
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q15_MVE)
#include "arm_cmplx_mag_fast_q15.c"
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q31_MVE)
#include "arm_cmplx_mag_q31.c"
#endif
#else
#include "arm_cmplx_mag_q15.c"
#include "arm_cmplx_mag_fast_q15.c"
#include "arm_cmplx_mag_q31.c"
#endif
#include "arm_cmplx_mag_squared_f32.c"
#include "arm_cmplx_mag_squared_f64.c"
#include "arm_cmplx_mag_squared_q15.c"
#include "arm_cmplx_mag_squared_q31.c"
#include "arm_cmplx_mult_cmplx_f32.c"
#include "arm_cmplx_mult_cmplx_f64.c"
#include "arm_cmplx_mult_cmplx_q15.c"
#include "arm_cmplx_mult_cmplx_q31.c"
#include "arm_cmplx_mult_real_f32.c"
#include "arm_cmplx_mult_real_q15.c"
#include "arm_cmplx_mult_real_q31.c"

View File

@@ -0,0 +1,32 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: CompexMathFunctionsF16.c
* Description: Combination of all complex math function f16 source files.
*
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2020 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_cmplx_conj_f16.c"
#include "arm_cmplx_dot_prod_f16.c"
#include "arm_cmplx_mag_f16.c"
#include "arm_cmplx_mag_squared_f16.c"
#include "arm_cmplx_mult_cmplx_f16.c"
#include "arm_cmplx_mult_real_f16.c"

View File

@@ -0,0 +1,185 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_conj_f16.c
* Description: Floating-point complex conjugate
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/complex_math_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupCmplxMath
*/
/**
@defgroup cmplx_conj Complex Conjugate
Conjugates the elements of a complex data vector.
The <code>pSrc</code> points to the source data and
<code>pDst</code> points to the destination data where the result should be written.
<code>numSamples</code> specifies the number of complex samples
and the data in each array is stored in an interleaved fashion
(real, imag, real, imag, ...).
Each array has a total of <code>2*numSamples</code> values.
The underlying algorithm is used:
<pre>
for (n = 0; n < numSamples; n++) {
pDst[(2*n) ] = pSrc[(2*n) ]; // real part
pDst[(2*n)+1] = -pSrc[(2*n)+1]; // imag part
}
</pre>
There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
@addtogroup cmplx_conj
@{
*/
/**
@brief Floating-point complex conjugate.
@param[in] pSrc points to the input vector
@param[out] pDst points to the output vector
@param[in] numSamples number of samples in each vector
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_cmplx_conj_f16(
const float16_t * pSrc,
float16_t * pDst,
uint32_t numSamples)
{
static const float16_t cmplx_conj_sign[8] = { 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f };
uint32_t blockSize = numSamples * CMPLX_DIM; /* loop counters */
uint32_t blkCnt;
f16x8_t vecSrc;
f16x8_t vecSign;
/*
* load sign vector
*/
vecSign = *(f16x8_t *) cmplx_conj_sign;
/* Compute 4 real samples at a time */
blkCnt = blockSize >> 3U;
while (blkCnt > 0U)
{
vecSrc = vld1q(pSrc);
vst1q(pDst,vmulq(vecSrc, vecSign));
/*
* Decrement the blkCnt loop counter
* Advance vector source and destination pointers
*/
pSrc += 8;
pDst += 8;
blkCnt--;
}
/* Tail */
blkCnt = (blockSize & 0x7) >> 1;
while (blkCnt > 0U)
{
/* C[0] + jC[1] = A[0]+ j(-1)A[1] */
/* Calculate Complex Conjugate and store result in destination buffer. */
*pDst++ = *pSrc++;
*pDst++ = -(_Float16)*pSrc++;
/* Decrement loop counter */
blkCnt--;
}
}
#else
void arm_cmplx_conj_f16(
const float16_t * pSrc,
float16_t * pDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C[0] + jC[1] = A[0]+ j(-1)A[1] */
/* Calculate Complex Conjugate and store result in destination buffer. */
*pDst++ = *pSrc++;
*pDst++ = -(_Float16)*pSrc++;
*pDst++ = *pSrc++;
*pDst++ = -(_Float16)*pSrc++;
*pDst++ = *pSrc++;
*pDst++ = -(_Float16)*pSrc++;
*pDst++ = *pSrc++;
*pDst++ = -(_Float16)*pSrc++;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C[0] + jC[1] = A[0]+ j(-1)A[1] */
/* Calculate Complex Conjugate and store result in destination buffer. */
*pDst++ = *pSrc++;
*pDst++ = -(_Float16)*pSrc++;
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of cmplx_conj group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -0,0 +1,213 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_conj_f32.c
* Description: Floating-point complex conjugate
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/complex_math_functions.h"
/**
@ingroup groupCmplxMath
*/
/**
@defgroup cmplx_conj Complex Conjugate
Conjugates the elements of a complex data vector.
The <code>pSrc</code> points to the source data and
<code>pDst</code> points to the destination data where the result should be written.
<code>numSamples</code> specifies the number of complex samples
and the data in each array is stored in an interleaved fashion
(real, imag, real, imag, ...).
Each array has a total of <code>2*numSamples</code> values.
The underlying algorithm is used:
<pre>
for (n = 0; n < numSamples; n++) {
pDst[(2*n) ] = pSrc[(2*n) ]; // real part
pDst[(2*n)+1] = -pSrc[(2*n)+1]; // imag part
}
</pre>
There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
@addtogroup cmplx_conj
@{
*/
/**
@brief Floating-point complex conjugate.
@param[in] pSrc points to the input vector
@param[out] pDst points to the output vector
@param[in] numSamples number of samples in each vector
@return none
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_cmplx_conj_f32(
const float32_t * pSrc,
float32_t * pDst,
uint32_t numSamples)
{
static const float32_t cmplx_conj_sign[4] = { 1.0f, -1.0f, 1.0f, -1.0f };
uint32_t blockSize = numSamples * CMPLX_DIM; /* loop counters */
uint32_t blkCnt;
f32x4_t vecSrc;
f32x4_t vecSign;
/*
* load sign vector
*/
vecSign = *(f32x4_t *) cmplx_conj_sign;
/* Compute 4 real samples at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
vecSrc = vld1q(pSrc);
vst1q(pDst,vmulq(vecSrc, vecSign));
/*
* Decrement the blkCnt loop counter
* Advance vector source and destination pointers
*/
pSrc += 4;
pDst += 4;
blkCnt--;
}
/* Tail */
blkCnt = (blockSize & 0x3) >> 1;
while (blkCnt > 0U)
{
/* C[0] + jC[1] = A[0]+ j(-1)A[1] */
/* Calculate Complex Conjugate and store result in destination buffer. */
*pDst++ = *pSrc++;
*pDst++ = -*pSrc++;
/* Decrement loop counter */
blkCnt--;
}
}
#else
void arm_cmplx_conj_f32(
const float32_t * pSrc,
float32_t * pDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
float32x4_t zero;
float32x4x2_t vec;
zero = vdupq_n_f32(0.0f);
/* Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C[0]+jC[1] = A[0]+(-1)*jA[1] */
/* Calculate Complex Conjugate and then store the results in the destination buffer. */
vec = vld2q_f32(pSrc);
vec.val[1] = vsubq_f32(zero,vec.val[1]);
vst2q_f32(pDst,vec);
/* Increment pointers */
pSrc += 8;
pDst += 8;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = numSamples & 0x3;
#else
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C[0] + jC[1] = A[0]+ j(-1)A[1] */
/* Calculate Complex Conjugate and store result in destination buffer. */
*pDst++ = *pSrc++;
*pDst++ = -*pSrc++;
*pDst++ = *pSrc++;
*pDst++ = -*pSrc++;
*pDst++ = *pSrc++;
*pDst++ = -*pSrc++;
*pDst++ = *pSrc++;
*pDst++ = -*pSrc++;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined (ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* C[0] + jC[1] = A[0]+ j(-1)A[1] */
/* Calculate Complex Conjugate and store result in destination buffer. */
*pDst++ = *pSrc++;
*pDst++ = -*pSrc++;
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of cmplx_conj group
*/

View File

@@ -0,0 +1,207 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_conj_q15.c
* Description: Q15 complex conjugate
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/complex_math_functions.h"
/**
@ingroup groupCmplxMath
*/
/**
@addtogroup cmplx_conj
@{
*/
/**
@brief Q15 complex conjugate.
@param[in] pSrc points to the input vector
@param[out] pDst points to the output vector
@param[in] numSamples number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
The Q15 value -1 (0x8000) is saturated to the maximum allowable positive value 0x7FFF.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_cmplx_conj_q15(
const q15_t * pSrc,
q15_t * pDst,
uint32_t numSamples)
{
uint32_t blockSize = numSamples * CMPLX_DIM; /* loop counters */
uint32_t blkCnt;
q31_t in1;
q15x8x2_t vecSrc;
q15x8_t zero;
zero = vdupq_n_s16(0);
/* Compute 8 real samples at a time */
blkCnt = blockSize >> 4U;
while (blkCnt > 0U)
{
vecSrc = vld2q(pSrc);
vecSrc.val[1] = vqsubq(zero, vecSrc.val[1]);
vst2q(pDst,vecSrc);
/*
* Decrement the blkCnt loop counter
* Advance vector source and destination pointers
*/
pSrc += 16;
pDst += 16;
blkCnt --;
}
/* Tail */
blkCnt = (blockSize & 0xF) >> 1;
while (blkCnt > 0U)
{
/* C[0] + jC[1] = A[0]+ j(-1)A[1] */
/* Calculate Complex Conjugate and store result in destination buffer. */
*pDst++ = *pSrc++;
in1 = *pSrc++;
*pDst++ = __SSAT(-in1, 16);
/* Decrement loop counter */
blkCnt--;
}
}
#else
void arm_cmplx_conj_q15(
const q15_t * pSrc,
q15_t * pDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* Loop counter */
q31_t in1; /* Temporary input variable */
#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
q31_t in2, in3, in4; /* Temporary input variables */
#endif
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C[0] + jC[1] = A[0]+ j(-1)A[1] */
/* Calculate Complex Conjugate and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
in1 = read_q15x2_ia (&pSrc);
in2 = read_q15x2_ia (&pSrc);
in3 = read_q15x2_ia (&pSrc);
in4 = read_q15x2_ia (&pSrc);
#ifndef ARM_MATH_BIG_ENDIAN
in1 = __QASX(0, in1);
in2 = __QASX(0, in2);
in3 = __QASX(0, in3);
in4 = __QASX(0, in4);
#else
in1 = __QSAX(0, in1);
in2 = __QSAX(0, in2);
in3 = __QSAX(0, in3);
in4 = __QSAX(0, in4);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
in1 = ((uint32_t) in1 >> 16) | ((uint32_t) in1 << 16);
in2 = ((uint32_t) in2 >> 16) | ((uint32_t) in2 << 16);
in3 = ((uint32_t) in3 >> 16) | ((uint32_t) in3 << 16);
in4 = ((uint32_t) in4 >> 16) | ((uint32_t) in4 << 16);
write_q15x2_ia (&pDst, in1);
write_q15x2_ia (&pDst, in2);
write_q15x2_ia (&pDst, in3);
write_q15x2_ia (&pDst, in4);
#else
*pDst++ = *pSrc++;
in1 = *pSrc++;
*pDst++ = (in1 == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in1;
*pDst++ = *pSrc++;
in1 = *pSrc++;
*pDst++ = (in1 == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in1;
*pDst++ = *pSrc++;
in1 = *pSrc++;
*pDst++ = (in1 == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in1;
*pDst++ = *pSrc++;
in1 = *pSrc++;
*pDst++ = (in1 == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in1;
#endif /* #if defined (ARM_MATH_DSP) */
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C[0] + jC[1] = A[0]+ j(-1)A[1] */
/* Calculate Complex Conjugate and store result in destination buffer. */
*pDst++ = *pSrc++;
in1 = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = __SSAT(-in1, 16);
#else
*pDst++ = (in1 == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in1;
#endif
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of cmplx_conj group
*/

View File

@@ -0,0 +1,193 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_conj_q31.c
* Description: Q31 complex conjugate
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/complex_math_functions.h"
/**
@ingroup groupCmplxMath
*/
/**
@addtogroup cmplx_conj
@{
*/
/**
@brief Q31 complex conjugate.
@param[in] pSrc points to the input vector
@param[out] pDst points to the output vector
@param[in] numSamples number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
The Q31 value -1 (0x80000000) is saturated to the maximum allowable positive value 0x7FFFFFFF.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_cmplx_conj_q31(
const q31_t * pSrc,
q31_t * pDst,
uint32_t numSamples)
{
uint32_t blockSize = numSamples * CMPLX_DIM; /* loop counters */
uint32_t blkCnt;
q31x4x2_t vecSrc;
q31_t in; /* Temporary input variable */
q31x4_t zero;
zero = vdupq_n_s32(0);
/* Compute 4 real samples at a time */
blkCnt = blockSize >> 3U;
while (blkCnt > 0U)
{
vecSrc = vld2q(pSrc);
vecSrc.val[1] = vqsubq(zero, vecSrc.val[1]);
vst2q(pDst,vecSrc);
/*
* Decrement the blkCnt loop counter
* Advance vector source and destination pointers
*/
pSrc += 8;
pDst += 8;
blkCnt --;
}
/* Tail */
blkCnt = (blockSize & 0x7) >> 1;
while (blkCnt > 0U)
{
/* C[0] + jC[1] = A[0]+ j(-1)A[1] */
/* Calculate Complex Conjugate and store result in destination buffer. */
*pDst++ = *pSrc++;
in = *pSrc++;
*pDst++ = __QSUB(0, in);
/* Decrement loop counter */
blkCnt--;
}
}
#else
void arm_cmplx_conj_q31(
const q31_t * pSrc,
q31_t * pDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* Loop counter */
q31_t in; /* Temporary input variable */
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C[0] + jC[1] = A[0]+ j(-1)A[1] */
/* Calculate Complex Conjugate and store result in destination buffer. */
*pDst++ = *pSrc++;
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = __QSUB(0, in);
#else
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
#endif
*pDst++ = *pSrc++;
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = __QSUB(0, in);
#else
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
#endif
*pDst++ = *pSrc++;
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = __QSUB(0, in);
#else
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
#endif
*pDst++ = *pSrc++;
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = __QSUB(0, in);
#else
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C[0] + jC[1] = A[0]+ j(-1)A[1] */
/* Calculate Complex Conjugate and store result in destination buffer. */
*pDst++ = *pSrc++;
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = __QSUB(0, in);
#else
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
#endif
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of cmplx_conj group
*/

View File

@@ -0,0 +1,288 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_dot_prod_f16.c
* Description: Floating-point complex dot product
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/complex_math_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupCmplxMath
*/
/**
@defgroup cmplx_dot_prod Complex Dot Product
Computes the dot product of two complex vectors.
The vectors are multiplied element-by-element and then summed.
The <code>pSrcA</code> points to the first complex input vector and
<code>pSrcB</code> points to the second complex input vector.
<code>numSamples</code> specifies the number of complex samples
and the data in each array is stored in an interleaved fashion
(real, imag, real, imag, ...).
Each array has a total of <code>2*numSamples</code> values.
The underlying algorithm is used:
<pre>
realResult = 0;
imagResult = 0;
for (n = 0; n < numSamples; n++) {
realResult += pSrcA[(2*n)+0] * pSrcB[(2*n)+0] - pSrcA[(2*n)+1] * pSrcB[(2*n)+1];
imagResult += pSrcA[(2*n)+0] * pSrcB[(2*n)+1] + pSrcA[(2*n)+1] * pSrcB[(2*n)+0];
}
</pre>
There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
@addtogroup cmplx_dot_prod
@{
*/
/**
@brief Floating-point complex dot product.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[in] numSamples number of samples in each vector
@param[out] realResult real part of the result returned here
@param[out] imagResult imaginary part of the result returned here
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_cmplx_dot_prod_f16(
const float16_t * pSrcA,
const float16_t * pSrcB,
uint32_t numSamples,
float16_t * realResult,
float16_t * imagResult)
{
int32_t blkCnt;
float16_t real_sum, imag_sum;
f16x8_t vecSrcA, vecSrcB;
f16x8_t vec_acc = vdupq_n_f16(0.0f16);
f16x8_t vecSrcC, vecSrcD;
blkCnt = (numSamples >> 3);
blkCnt -= 1;
if (blkCnt > 0) {
/* should give more freedom to generate stall free code */
vecSrcA = vld1q( pSrcA);
vecSrcB = vld1q( pSrcB);
pSrcA += 8;
pSrcB += 8;
while (blkCnt > 0) {
vec_acc = vcmlaq(vec_acc, vecSrcA, vecSrcB);
vecSrcC = vld1q(pSrcA);
pSrcA += 8;
vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
vecSrcD = vld1q(pSrcB);
pSrcB += 8;
vec_acc = vcmlaq(vec_acc, vecSrcC, vecSrcD);
vecSrcA = vld1q(pSrcA);
pSrcA += 8;
vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
vecSrcB = vld1q(pSrcB);
pSrcB += 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/* process last elements out of the loop avoid the armclang breaking the SW pipeline */
vec_acc = vcmlaq(vec_acc, vecSrcA, vecSrcB);
vecSrcC = vld1q(pSrcA);
vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
vecSrcD = vld1q(pSrcB);
vec_acc = vcmlaq(vec_acc, vecSrcC, vecSrcD);
vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
/*
* tail
*/
blkCnt = CMPLX_DIM * (numSamples & 7);
while (blkCnt > 0) {
mve_pred16_t p = vctp16q(blkCnt);
pSrcA += 8;
pSrcB += 8;
vecSrcA = vldrhq_z_f16(pSrcA, p);
vecSrcB = vldrhq_z_f16(pSrcB, p);
vec_acc = vcmlaq_m(vec_acc, vecSrcA, vecSrcB, p);
vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
blkCnt -= 8;
}
} else {
/* small vector */
blkCnt = numSamples * CMPLX_DIM;
vec_acc = vdupq_n_f16(0.0f16);
do {
mve_pred16_t p = vctp16q(blkCnt);
vecSrcA = vldrhq_z_f16(pSrcA, p);
vecSrcB = vldrhq_z_f16(pSrcB, p);
vec_acc = vcmlaq_m(vec_acc, vecSrcA, vecSrcB, p);
vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
/*
* Decrement the blkCnt loop counter
* Advance vector source and destination pointers
*/
pSrcA += 8;
pSrcB += 8;
blkCnt -= 8;
}
while (blkCnt > 0);
}
/* Sum the partial parts */
mve_cmplx_sum_intra_r_i_f16(vec_acc, real_sum, imag_sum);
/*
* Store the real and imaginary results in the destination buffers
*/
*realResult = real_sum;
*imagResult = imag_sum;
}
#else
void arm_cmplx_dot_prod_f16(
const float16_t * pSrcA,
const float16_t * pSrcB,
uint32_t numSamples,
float16_t * realResult,
float16_t * imagResult)
{
uint32_t blkCnt; /* Loop counter */
_Float16 real_sum = 0.0f, imag_sum = 0.0f; /* Temporary result variables */
_Float16 a0,b0,c0,d0;
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
/* Decrement loop counter */
blkCnt--;
}
/* Store real and imaginary result in destination buffer. */
*realResult = real_sum;
*imagResult = imag_sum;
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of cmplx_dot_prod group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -0,0 +1,340 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_dot_prod_f32.c
* Description: Floating-point complex dot product
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/complex_math_functions.h"
/**
@ingroup groupCmplxMath
*/
/**
@defgroup cmplx_dot_prod Complex Dot Product
Computes the dot product of two complex vectors.
The vectors are multiplied element-by-element and then summed.
The <code>pSrcA</code> points to the first complex input vector and
<code>pSrcB</code> points to the second complex input vector.
<code>numSamples</code> specifies the number of complex samples
and the data in each array is stored in an interleaved fashion
(real, imag, real, imag, ...).
Each array has a total of <code>2*numSamples</code> values.
The underlying algorithm is used:
<pre>
realResult = 0;
imagResult = 0;
for (n = 0; n < numSamples; n++) {
realResult += pSrcA[(2*n)+0] * pSrcB[(2*n)+0] - pSrcA[(2*n)+1] * pSrcB[(2*n)+1];
imagResult += pSrcA[(2*n)+0] * pSrcB[(2*n)+1] + pSrcA[(2*n)+1] * pSrcB[(2*n)+0];
}
</pre>
There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
@addtogroup cmplx_dot_prod
@{
*/
/**
@brief Floating-point complex dot product.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[in] numSamples number of samples in each vector
@param[out] realResult real part of the result returned here
@param[out] imagResult imaginary part of the result returned here
@return none
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_cmplx_dot_prod_f32(
const float32_t * pSrcA,
const float32_t * pSrcB,
uint32_t numSamples,
float32_t * realResult,
float32_t * imagResult)
{
int32_t blkCnt;
float32_t real_sum, imag_sum;
f32x4_t vecSrcA, vecSrcB;
f32x4_t vec_acc = vdupq_n_f32(0.0f);
f32x4_t vecSrcC, vecSrcD;
blkCnt = numSamples >> 2;
blkCnt -= 1;
if (blkCnt > 0) {
/* should give more freedom to generate stall free code */
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
pSrcA += 4;
pSrcB += 4;
while (blkCnt > 0) {
vec_acc = vcmlaq(vec_acc, vecSrcA, vecSrcB);
vecSrcC = vld1q(pSrcA);
pSrcA += 4;
vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
vecSrcD = vld1q(pSrcB);
pSrcB += 4;
vec_acc = vcmlaq(vec_acc, vecSrcC, vecSrcD);
vecSrcA = vld1q(pSrcA);
pSrcA += 4;
vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
vecSrcB = vld1q(pSrcB);
pSrcB += 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/* process last elements out of the loop avoid the armclang breaking the SW pipeline */
vec_acc = vcmlaq(vec_acc, vecSrcA, vecSrcB);
vecSrcC = vld1q(pSrcA);
vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
vecSrcD = vld1q(pSrcB);
vec_acc = vcmlaq(vec_acc, vecSrcC, vecSrcD);
vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
/*
* tail
*/
blkCnt = CMPLX_DIM * (numSamples & 3);
while (blkCnt > 0) {
mve_pred16_t p = vctp32q(blkCnt);
pSrcA += 4;
pSrcB += 4;
vecSrcA = vldrwq_z_f32(pSrcA, p);
vecSrcB = vldrwq_z_f32(pSrcB, p);
vec_acc = vcmlaq_m(vec_acc, vecSrcA, vecSrcB, p);
vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
blkCnt -= 4;
}
} else {
/* small vector */
blkCnt = numSamples * CMPLX_DIM;
vec_acc = vdupq_n_f32(0.0f);
do {
mve_pred16_t p = vctp32q(blkCnt);
vecSrcA = vldrwq_z_f32(pSrcA, p);
vecSrcB = vldrwq_z_f32(pSrcB, p);
vec_acc = vcmlaq_m(vec_acc, vecSrcA, vecSrcB, p);
vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
/*
* Decrement the blkCnt loop counter
* Advance vector source and destination pointers
*/
pSrcA += 4;
pSrcB += 4;
blkCnt -= 4;
}
while (blkCnt > 0);
}
real_sum = vgetq_lane(vec_acc, 0) + vgetq_lane(vec_acc, 2);
imag_sum = vgetq_lane(vec_acc, 1) + vgetq_lane(vec_acc, 3);
/*
* Store the real and imaginary results in the destination buffers
*/
*realResult = real_sum;
*imagResult = imag_sum;
}
#else
void arm_cmplx_dot_prod_f32(
const float32_t * pSrcA,
const float32_t * pSrcB,
uint32_t numSamples,
float32_t * realResult,
float32_t * imagResult)
{
uint32_t blkCnt; /* Loop counter */
float32_t real_sum = 0.0f, imag_sum = 0.0f; /* Temporary result variables */
float32_t a0,b0,c0,d0;
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
float32x4x2_t vec1,vec2,vec3,vec4;
float32x4_t accR,accI;
float32x2_t accum = vdup_n_f32(0);
accR = vdupq_n_f32(0.0f);
accI = vdupq_n_f32(0.0f);
/* Loop unrolling: Compute 8 outputs at a time */
blkCnt = numSamples >> 3U;
while (blkCnt > 0U)
{
/* C = (A[0]+jA[1])*(B[0]+jB[1]) + ... */
/* Calculate dot product and then store the result in a temporary buffer. */
vec1 = vld2q_f32(pSrcA);
vec2 = vld2q_f32(pSrcB);
/* Increment pointers */
pSrcA += 8;
pSrcB += 8;
/* Re{C} = Re{A}*Re{B} - Im{A}*Im{B} */
accR = vmlaq_f32(accR,vec1.val[0],vec2.val[0]);
accR = vmlsq_f32(accR,vec1.val[1],vec2.val[1]);
/* Im{C} = Re{A}*Im{B} + Im{A}*Re{B} */
accI = vmlaq_f32(accI,vec1.val[1],vec2.val[0]);
accI = vmlaq_f32(accI,vec1.val[0],vec2.val[1]);
vec3 = vld2q_f32(pSrcA);
vec4 = vld2q_f32(pSrcB);
/* Increment pointers */
pSrcA += 8;
pSrcB += 8;
/* Re{C} = Re{A}*Re{B} - Im{A}*Im{B} */
accR = vmlaq_f32(accR,vec3.val[0],vec4.val[0]);
accR = vmlsq_f32(accR,vec3.val[1],vec4.val[1]);
/* Im{C} = Re{A}*Im{B} + Im{A}*Re{B} */
accI = vmlaq_f32(accI,vec3.val[1],vec4.val[0]);
accI = vmlaq_f32(accI,vec3.val[0],vec4.val[1]);
/* Decrement the loop counter */
blkCnt--;
}
accum = vpadd_f32(vget_low_f32(accR), vget_high_f32(accR));
real_sum += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
accum = vpadd_f32(vget_low_f32(accI), vget_high_f32(accI));
imag_sum += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
/* Tail */
blkCnt = numSamples & 0x7;
#else
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
/* Decrement loop counter */
blkCnt--;
}
/* Store real and imaginary result in destination buffer. */
*realResult = real_sum;
*imagResult = imag_sum;
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of cmplx_dot_prod group
*/

View File

@@ -0,0 +1,256 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_dot_prod_q15.c
* Description: Processing function for the Q15 Complex Dot product
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/complex_math_functions.h"
/**
@ingroup groupCmplxMath
*/
/**
@addtogroup cmplx_dot_prod
@{
*/
/**
@brief Q15 complex dot product.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[in] numSamples number of samples in each vector
@param[out] realResult real part of the result returned here
@param[out] imagResult imaginary part of the result returned her
@return none
@par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator.
The intermediate 1.15 by 1.15 multiplications are performed with full precision and yield a 2.30 result.
These are accumulated in a 64-bit accumulator with 34.30 precision.
As a final step, the accumulators are converted to 8.24 format.
The return results <code>realResult</code> and <code>imagResult</code> are in 8.24 format.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_cmplx_dot_prod_q15(
const q15_t * pSrcA,
const q15_t * pSrcB,
uint32_t numSamples,
q31_t * realResult,
q31_t * imagResult)
{
int32_t blkCnt;
q63_t accReal = 0LL;
q63_t accImag = 0LL;
q15x8_t vecSrcA, vecSrcB;
q15x8_t vecSrcC, vecSrcD;
blkCnt = (numSamples >> 3);
blkCnt -= 1;
if (blkCnt > 0) {
/* should give more freedom to generate stall free code */
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
pSrcA += 8;
pSrcB += 8;
while (blkCnt > 0) {
accReal = vmlsldavaq(accReal, vecSrcA, vecSrcB);
vecSrcC = vld1q(pSrcA);
pSrcA += 8;
accImag = vmlaldavaxq(accImag, vecSrcA, vecSrcB);
vecSrcD = vld1q(pSrcB);
pSrcB += 8;
accReal = vmlsldavaq(accReal, vecSrcC, vecSrcD);
vecSrcA = vld1q(pSrcA);
pSrcA += 8;
accImag = vmlaldavaxq(accImag, vecSrcC, vecSrcD);
vecSrcB = vld1q(pSrcB);
pSrcB += 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/* process last elements out of the loop avoid the armclang breaking the SW pipeline */
accReal = vmlsldavaq(accReal, vecSrcA, vecSrcB);
vecSrcC = vld1q(pSrcA);
accImag = vmlaldavaxq(accImag, vecSrcA, vecSrcB);
vecSrcD = vld1q(pSrcB);
accReal = vmlsldavaq(accReal, vecSrcC, vecSrcD);
vecSrcA = vld1q(pSrcA);
accImag = vmlaldavaxq(accImag, vecSrcC, vecSrcD);
vecSrcB = vld1q(pSrcB);
/*
* tail
*/
blkCnt = CMPLX_DIM * (numSamples & 7);
do {
mve_pred16_t p = vctp16q(blkCnt);
pSrcA += 8;
pSrcB += 8;
vecSrcA = vldrhq_z_s16(pSrcA, p);
vecSrcB = vldrhq_z_s16(pSrcB, p);
accReal = vmlsldavaq_p(accReal, vecSrcA, vecSrcB, p);
accImag = vmlaldavaxq_p(accImag, vecSrcA, vecSrcB, p);
blkCnt -= 8;
}
while ((int32_t) blkCnt > 0);
} else {
blkCnt = numSamples * CMPLX_DIM;
while (blkCnt > 0) {
mve_pred16_t p = vctp16q(blkCnt);
vecSrcA = vldrhq_z_s16(pSrcA, p);
vecSrcB = vldrhq_z_s16(pSrcB, p);
accReal = vmlsldavaq_p(accReal, vecSrcA, vecSrcB, p);
accImag = vmlaldavaxq_p(accImag, vecSrcA, vecSrcB, p);
/*
* Decrement the blkCnt loop counter
* Advance vector source and destination pointers
*/
pSrcA += 8;
pSrcB += 8;
blkCnt -= 8;
}
}
*realResult = asrl(accReal, (14 - 8));
*imagResult = asrl(accImag, (14 - 8));
}
#else
void arm_cmplx_dot_prod_q15(
const q15_t * pSrcA,
const q15_t * pSrcB,
uint32_t numSamples,
q31_t * realResult,
q31_t * imagResult)
{
uint32_t blkCnt; /* Loop counter */
q63_t real_sum = 0, imag_sum = 0; /* Temporary result variables */
q15_t a0,b0,c0,d0;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += (q31_t)a0 * c0;
imag_sum += (q31_t)a0 * d0;
real_sum -= (q31_t)b0 * d0;
imag_sum += (q31_t)b0 * c0;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += (q31_t)a0 * c0;
imag_sum += (q31_t)a0 * d0;
real_sum -= (q31_t)b0 * d0;
imag_sum += (q31_t)b0 * c0;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += (q31_t)a0 * c0;
imag_sum += (q31_t)a0 * d0;
real_sum -= (q31_t)b0 * d0;
imag_sum += (q31_t)b0 * c0;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += (q31_t)a0 * c0;
imag_sum += (q31_t)a0 * d0;
real_sum -= (q31_t)b0 * d0;
imag_sum += (q31_t)b0 * c0;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += (q31_t)a0 * c0;
imag_sum += (q31_t)a0 * d0;
real_sum -= (q31_t)b0 * d0;
imag_sum += (q31_t)b0 * c0;
/* Decrement loop counter */
blkCnt--;
}
/* Store real and imaginary result in 8.24 format */
/* Convert real data in 34.30 to 8.24 by 6 right shifts */
*realResult = (q31_t) (real_sum >> 6);
/* Convert imaginary data in 34.30 to 8.24 by 6 right shifts */
*imagResult = (q31_t) (imag_sum >> 6);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of cmplx_dot_prod group
*/

View File

@@ -0,0 +1,259 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_dot_prod_q31.c
* Description: Q31 complex dot product
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/complex_math_functions.h"
/**
@ingroup groupCmplxMath
*/
/**
@addtogroup cmplx_dot_prod
@{
*/
/**
@brief Q31 complex dot product.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[in] numSamples number of samples in each vector
@param[out] realResult real part of the result returned here
@param[out] imagResult imaginary part of the result returned here
@return none
@par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator.
The intermediate 1.31 by 1.31 multiplications are performed with 64-bit precision and then shifted to 16.48 format.
The internal real and imaginary accumulators are in 16.48 format and provide 15 guard bits.
Additions are nonsaturating and no overflow will occur as long as <code>numSamples</code> is less than 32768.
The return results <code>realResult</code> and <code>imagResult</code> are in 16.48 format.
Input down scaling is not required.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_cmplx_dot_prod_q31(
const q31_t * pSrcA,
const q31_t * pSrcB,
uint32_t numSamples,
q63_t * realResult,
q63_t * imagResult)
{
int32_t blkCnt;
q63_t accReal = 0LL;
q63_t accImag = 0LL;
q31x4_t vecSrcA, vecSrcB;
q31x4_t vecSrcC, vecSrcD;
blkCnt = numSamples >> 2;
blkCnt -= 1;
if (blkCnt > 0) {
/* should give more freedom to generate stall free code */
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
pSrcA += 4;
pSrcB += 4;
while (blkCnt > 0) {
accReal = vrmlsldavhaq(accReal, vecSrcA, vecSrcB);
vecSrcC = vld1q(pSrcA);
pSrcA += 4;
accImag = vrmlaldavhaxq(accImag, vecSrcA, vecSrcB);
vecSrcD = vld1q(pSrcB);
pSrcB += 4;
accReal = vrmlsldavhaq(accReal, vecSrcC, vecSrcD);
vecSrcA = vld1q(pSrcA);
pSrcA += 4;
accImag = vrmlaldavhaxq(accImag, vecSrcC, vecSrcD);
vecSrcB = vld1q(pSrcB);
pSrcB += 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/* process last elements out of the loop avoid the armclang breaking the SW pipeline */
accReal = vrmlsldavhaq(accReal, vecSrcA, vecSrcB);
vecSrcC = vld1q(pSrcA);
accImag = vrmlaldavhaxq(accImag, vecSrcA, vecSrcB);
vecSrcD = vld1q(pSrcB);
accReal = vrmlsldavhaq(accReal, vecSrcC, vecSrcD);
vecSrcA = vld1q(pSrcA);
accImag = vrmlaldavhaxq(accImag, vecSrcC, vecSrcD);
vecSrcB = vld1q(pSrcB);
/*
* tail
*/
blkCnt = CMPLX_DIM * (numSamples & 3);
do {
mve_pred16_t p = vctp32q(blkCnt);
pSrcA += 4;
pSrcB += 4;
vecSrcA = vldrwq_z_s32(pSrcA, p);
vecSrcB = vldrwq_z_s32(pSrcB, p);
accReal = vrmlsldavhaq_p(accReal, vecSrcA, vecSrcB, p);
accImag = vrmlaldavhaxq_p(accImag, vecSrcA, vecSrcB, p);
blkCnt -= 4;
}
while ((int32_t) blkCnt > 0);
} else {
blkCnt = numSamples * CMPLX_DIM;
while (blkCnt > 0) {
mve_pred16_t p = vctp32q(blkCnt);
vecSrcA = vldrwq_z_s32(pSrcA, p);
vecSrcB = vldrwq_z_s32(pSrcB, p);
accReal = vrmlsldavhaq_p(accReal, vecSrcA, vecSrcB, p);
accImag = vrmlaldavhaxq_p(accImag, vecSrcA, vecSrcB, p);
/*
* Decrement the blkCnt loop counter
* Advance vector source and destination pointers
*/
pSrcA += 4;
pSrcB += 4;
blkCnt -= 4;
}
}
*realResult = asrl(accReal, (14 - 8));
*imagResult = asrl(accImag, (14 - 8));
}
#else
void arm_cmplx_dot_prod_q31(
const q31_t * pSrcA,
const q31_t * pSrcB,
uint32_t numSamples,
q63_t * realResult,
q63_t * imagResult)
{
uint32_t blkCnt; /* Loop counter */
q63_t real_sum = 0, imag_sum = 0; /* Temporary result variables */
q31_t a0,b0,c0,d0;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += ((q63_t)a0 * c0) >> 14;
imag_sum += ((q63_t)a0 * d0) >> 14;
real_sum -= ((q63_t)b0 * d0) >> 14;
imag_sum += ((q63_t)b0 * c0) >> 14;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += ((q63_t)a0 * c0) >> 14;
imag_sum += ((q63_t)a0 * d0) >> 14;
real_sum -= ((q63_t)b0 * d0) >> 14;
imag_sum += ((q63_t)b0 * c0) >> 14;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += ((q63_t)a0 * c0) >> 14;
imag_sum += ((q63_t)a0 * d0) >> 14;
real_sum -= ((q63_t)b0 * d0) >> 14;
imag_sum += ((q63_t)b0 * c0) >> 14;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += ((q63_t)a0 * c0) >> 14;
imag_sum += ((q63_t)a0 * d0) >> 14;
real_sum -= ((q63_t)b0 * d0) >> 14;
imag_sum += ((q63_t)b0 * c0) >> 14;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += ((q63_t)a0 * c0) >> 14;
imag_sum += ((q63_t)a0 * d0) >> 14;
real_sum -= ((q63_t)b0 * d0) >> 14;
imag_sum += ((q63_t)b0 * c0) >> 14;
/* Decrement loop counter */
blkCnt--;
}
/* Store real and imaginary result in 16.48 format */
*realResult = real_sum;
*imagResult = imag_sum;
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of cmplx_dot_prod group
*/

View File

@@ -0,0 +1,241 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_mag_f16.c
* Description: Floating-point complex magnitude
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/complex_math_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupCmplxMath
*/
/**
@defgroup cmplx_mag Complex Magnitude
Computes the magnitude of the elements of a complex data vector.
The <code>pSrc</code> points to the source data and
<code>pDst</code> points to the where the result should be written.
<code>numSamples</code> specifies the number of complex samples
in the input array and the data is stored in an interleaved fashion
(real, imag, real, imag, ...).
The input array has a total of <code>2*numSamples</code> values;
the output array has a total of <code>numSamples</code> values.
The underlying algorithm is used:
<pre>
for (n = 0; n < numSamples; n++) {
pDst[n] = sqrt(pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2);
}
</pre>
There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
@addtogroup cmplx_mag
@{
*/
/**
@brief Floating-point complex magnitude.
@param[in] pSrc points to input vector
@param[out] pDst points to output vector
@param[in] numSamples number of samples in each vector
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_cmplx_mag_f16(
const float16_t * pSrc,
float16_t * pDst,
uint32_t numSamples)
{
int32_t blockSize = numSamples; /* loop counters */
uint32_t blkCnt; /* loop counters */
f16x8x2_t vecSrc;
f16x8_t sum;
/* Compute 4 complex samples at a time */
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
q15x8_t newtonStartVec;
f16x8_t sumHalf, invSqrt;
vecSrc = vld2q(pSrc);
pSrc += 16;
sum = vmulq(vecSrc.val[0], vecSrc.val[0]);
sum = vfmaq(sum, vecSrc.val[1], vecSrc.val[1]);
/*
* inlined Fast SQRT using inverse SQRT newton-raphson method
*/
/* compute initial value */
newtonStartVec = vdupq_n_s16(INVSQRT_MAGIC_F16) - vshrq((q15x8_t) sum, 1);
sumHalf = sum * 0.5f;
/*
* compute 3 x iterations
*
* The more iterations, the more accuracy.
* If you need to trade a bit of accuracy for more performance,
* you can comment out the 3rd use of the macro.
*/
INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, (f16x8_t) newtonStartVec);
INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, invSqrt);
INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, invSqrt);
/*
* set negative values to 0
*/
invSqrt = vdupq_m(invSqrt, (float16_t)0.0f, vcmpltq(invSqrt, (float16_t)0.0f));
/*
* sqrt(x) = x * invSqrt(x)
*/
sum = vmulq(sum, invSqrt);
vstrhq_f16(pDst, sum);
pDst += 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
*/
blkCnt = blockSize & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
q15x8_t newtonStartVec;
f16x8_t sumHalf, invSqrt;
vecSrc = vld2q((float16_t const *)pSrc);
sum = vmulq(vecSrc.val[0], vecSrc.val[0]);
sum = vfmaq(sum, vecSrc.val[1], vecSrc.val[1]);
/*
* inlined Fast SQRT using inverse SQRT newton-raphson method
*/
/* compute initial value */
newtonStartVec = vdupq_n_s16(INVSQRT_MAGIC_F16) - vshrq((q15x8_t) sum, 1);
sumHalf = vmulq(sum, (float16_t)0.5);
/*
* compute 2 x iterations
*/
INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, (f16x8_t) newtonStartVec);
INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, invSqrt);
/*
* set negative values to 0
*/
invSqrt = vdupq_m(invSqrt, (float16_t)0.0, vcmpltq(invSqrt, (float16_t)0.0));
/*
* sqrt(x) = x * invSqrt(x)
*/
sum = vmulq(sum, invSqrt);
vstrhq_p_f16(pDst, sum, p0);
}
}
#else
void arm_cmplx_mag_f16(
const float16_t * pSrc,
float16_t * pDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* loop counter */
_Float16 real, imag; /* Temporary variables to hold input values */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
real = *pSrc++;
imag = *pSrc++;
/* store result in destination buffer. */
arm_sqrt_f16((real * real) + (imag * imag), pDst++);
real = *pSrc++;
imag = *pSrc++;
arm_sqrt_f16((real * real) + (imag * imag), pDst++);
real = *pSrc++;
imag = *pSrc++;
arm_sqrt_f16((real * real) + (imag * imag), pDst++);
real = *pSrc++;
imag = *pSrc++;
arm_sqrt_f16((real * real) + (imag * imag), pDst++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
real = *pSrc++;
imag = *pSrc++;
/* store result in destination buffer. */
arm_sqrt_f16((real * real) + (imag * imag), pDst++);
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of cmplx_mag group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -0,0 +1,273 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_mag_f32.c
* Description: Floating-point complex magnitude
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/complex_math_functions.h"
/**
@ingroup groupCmplxMath
*/
/**
@defgroup cmplx_mag Complex Magnitude
Computes the magnitude of the elements of a complex data vector.
The <code>pSrc</code> points to the source data and
<code>pDst</code> points to the where the result should be written.
<code>numSamples</code> specifies the number of complex samples
in the input array and the data is stored in an interleaved fashion
(real, imag, real, imag, ...).
The input array has a total of <code>2*numSamples</code> values;
the output array has a total of <code>numSamples</code> values.
The underlying algorithm is used:
<pre>
for (n = 0; n < numSamples; n++) {
pDst[n] = sqrt(pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2);
}
</pre>
There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
@addtogroup cmplx_mag
@{
*/
/**
@brief Floating-point complex magnitude.
@param[in] pSrc points to input vector
@param[out] pDst points to output vector
@param[in] numSamples number of samples in each vector
@return none
*/
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_vec_math.h"
#endif
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_cmplx_mag_f32(
const float32_t * pSrc,
float32_t * pDst,
uint32_t numSamples)
{
int32_t blockSize = numSamples; /* loop counters */
uint32_t blkCnt; /* loop counters */
f32x4x2_t vecSrc;
f32x4_t sum;
float32_t real, imag; /* Temporary variables to hold input values */
/* Compute 4 complex samples at a time */
blkCnt = blockSize >> 2;
while (blkCnt > 0U)
{
q31x4_t newtonStartVec;
f32x4_t sumHalf, invSqrt;
vecSrc = vld2q(pSrc);
pSrc += 8;
sum = vmulq(vecSrc.val[0], vecSrc.val[0]);
sum = vfmaq(sum, vecSrc.val[1], vecSrc.val[1]);
/*
* inlined Fast SQRT using inverse SQRT newton-raphson method
*/
/* compute initial value */
newtonStartVec = vdupq_n_s32(INVSQRT_MAGIC_F32) - vshrq((q31x4_t) sum, 1);
sumHalf = sum * 0.5f;
/*
* compute 3 x iterations
*
* The more iterations, the more accuracy.
* If you need to trade a bit of accuracy for more performance,
* you can comment out the 3rd use of the macro.
*/
INVSQRT_NEWTON_MVE_F32(invSqrt, sumHalf, (f32x4_t) newtonStartVec);
INVSQRT_NEWTON_MVE_F32(invSqrt, sumHalf, invSqrt);
INVSQRT_NEWTON_MVE_F32(invSqrt, sumHalf, invSqrt);
/*
* set negative values to 0
*/
invSqrt = vdupq_m(invSqrt, 0.0f, vcmpltq(invSqrt, 0.0f));
/*
* sqrt(x) = x * invSqrt(x)
*/
sum = vmulq(sum, invSqrt);
vst1q(pDst, sum);
pDst += 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
*/
blkCnt = blockSize & 3;
while (blkCnt > 0U)
{
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
real = *pSrc++;
imag = *pSrc++;
/* store result in destination buffer. */
arm_sqrt_f32((real * real) + (imag * imag), pDst++);
/* Decrement loop counter */
blkCnt--;
}
}
#else
void arm_cmplx_mag_f32(
const float32_t * pSrc,
float32_t * pDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* loop counter */
float32_t real, imag; /* Temporary variables to hold input values */
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
float32x4x2_t vecA;
float32x4_t vRealA;
float32x4_t vImagA;
float32x4_t vMagSqA;
float32x4x2_t vecB;
float32x4_t vRealB;
float32x4_t vImagB;
float32x4_t vMagSqB;
/* Loop unrolling: Compute 8 outputs at a time */
blkCnt = numSamples >> 3;
while (blkCnt > 0U)
{
/* out = sqrt((real * real) + (imag * imag)) */
vecA = vld2q_f32(pSrc);
pSrc += 8;
vecB = vld2q_f32(pSrc);
pSrc += 8;
vRealA = vmulq_f32(vecA.val[0], vecA.val[0]);
vImagA = vmulq_f32(vecA.val[1], vecA.val[1]);
vMagSqA = vaddq_f32(vRealA, vImagA);
vRealB = vmulq_f32(vecB.val[0], vecB.val[0]);
vImagB = vmulq_f32(vecB.val[1], vecB.val[1]);
vMagSqB = vaddq_f32(vRealB, vImagB);
/* Store the result in the destination buffer. */
vst1q_f32(pDst, __arm_vec_sqrt_f32_neon(vMagSqA));
pDst += 4;
vst1q_f32(pDst, __arm_vec_sqrt_f32_neon(vMagSqB));
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
blkCnt = numSamples & 7;
#else
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
real = *pSrc++;
imag = *pSrc++;
/* store result in destination buffer. */
arm_sqrt_f32((real * real) + (imag * imag), pDst++);
real = *pSrc++;
imag = *pSrc++;
arm_sqrt_f32((real * real) + (imag * imag), pDst++);
real = *pSrc++;
imag = *pSrc++;
arm_sqrt_f32((real * real) + (imag * imag), pDst++);
real = *pSrc++;
imag = *pSrc++;
arm_sqrt_f32((real * real) + (imag * imag), pDst++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
real = *pSrc++;
imag = *pSrc++;
/* store result in destination buffer. */
arm_sqrt_f32((real * real) + (imag * imag), pDst++);
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of cmplx_mag group
*/

Some files were not shown because too many files have changed in this diff Show More