* Copyright (c) 2026 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
* \file kernel_simt_transcendental_impl.h
* \brief
*/
#ifndef IMPL_SIMT_API_CPP_DAV_C310_KERNEL_SIMT_TRANSCENDENTAL_IMPL_H
#define IMPL_SIMT_API_CPP_DAV_C310_KERNEL_SIMT_TRANSCENDENTAL_IMPL_H
#if defined(ASCENDC_CPU_DEBUG)
#include <cmath>
#include "kernel_utils.h"
#include "stub_def.h"
#endif
#include "impl/simt_api/cpp/dav_3510/kernel_simt_cmp_impl.h"
#include "impl/simt_api/cpp/dav_3510/kernel_simt_math_impl.h"
namespace AscendC {
namespace Simt {
#if defined(ASCENDC_CPU_DEBUG)
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T ExpImpl(T x)
{
return exp(x);
}
#else
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T ExpImpl(T x)
{
return __expf(x);
}
__SIMT_DEVICE_FUNCTIONS_DECL__ inline half2 ExpImpl(half2 x)
{
return __exp(x);
}
#endif
* Performs Payne-Hanek radian reduction for trigonometric functions.
* Reduce the input angle to the range [0, pi/2) and determine the quadrant.
*
* @param x The input angle in radians.
* @param outputQuadrant Pointer to store the quadrant information.
* @return The reduced angle in the range [0, pi/2).
*/
__SIMT_DEVICE_FUNCTIONS_DECL__ inline float PayneHanekRadianReduction(float x, int *outputQuadrant)
{
uint32_t inputBits = reinterpret_cast<uint32_t &>(x);
int32_t exponent = ((inputBits & 0x7F800000) >> 23) - 127;
uint32_t exponentIndex = static_cast<uint32_t>(exponent) >> 5;
constexpr uint32_t twoOverPiTable[] = {
0x517cc1b7, 0x27220a94, 0xfe13abe8, 0xfa9a6ee0, 0x6db14acc, 0x9e21c820
};
uint32_t highTerm = exponentIndex ? twoOverPiTable[exponentIndex - 1] : 0;
uint32_t midTerm = twoOverPiTable[exponentIndex];
uint32_t lowTerm = twoOverPiTable[exponentIndex + 1];
uint32_t lastTerm = twoOverPiTable[exponentIndex + 2];
int32_t exponentRemainder = static_cast<uint32_t>(exponent) & 0x1F;
if (exponentRemainder != 0) {
highTerm = (highTerm << exponentRemainder) | (midTerm >> (ConstantsInternal::FOUR_BYTE_LEN - exponentRemainder));
midTerm = (midTerm << exponentRemainder) | (lowTerm >> (ConstantsInternal::FOUR_BYTE_LEN - exponentRemainder));
lowTerm = (lowTerm << exponentRemainder) | (lastTerm >> (ConstantsInternal::FOUR_BYTE_LEN - exponentRemainder));
}
uint32_t mantissa = (inputBits & 0x007FFFFF) | 0x4F000000;
uint32_t normalizedMantissa = static_cast<uint32_t>(reinterpret_cast<float &>(mantissa));
uint64_t product = static_cast<uint64_t>(normalizedMantissa) * lowTerm;
product = static_cast<uint64_t>(normalizedMantissa) * midTerm + (product >> ConstantsInternal::FOUR_BYTE_LEN);
product = (static_cast<uint64_t>(normalizedMantissa * highTerm) << ConstantsInternal::FOUR_BYTE_LEN) + product;
int32_t quotient = static_cast<int32_t>(product >> 62);
product = product & 0x3FFFFFFFFFFFFFFFULL;
if (product & 0x2000000000000000ULL) {
product -= 0x4000000000000000ULL;
quotient += 1;
}
int64_t productInt64 = static_cast<int64_t>(product);
int64_t highFloat = static_cast<float>(productInt64);
productInt64 = productInt64 - static_cast<int64_t>(highFloat);
int64_t lowFloat = static_cast<float>(productInt64);
float piOverTwoLow = 3.4061215800865545e-19f;
float reducedAngle = (highFloat + lowFloat) * piOverTwoLow;
if (x < 0.0f) {
reducedAngle = -reducedAngle;
quotient = -quotient;
}
*outputQuadrant = quotient;
return reducedAngle;
}
* Performs Cody-Waite radian reduction for trigonometric functions.
* Reduce the input angle using a simpler method compared to Payne-Hanek.
*
* @param x The input angle in radians.
* @param quadrant Pointer to store the quadrant information.
* @return The reduced angle in the range [0, pi/2).
*/
__SIMT_DEVICE_FUNCTIONS_DECL__ inline float CodyWaiteRadianReduction(float x, int *quadrant)
{
float y = FmaImpl(x, 0.636619747f, 12582912.0f);
*quadrant = reinterpret_cast<int &>(y);
y = y - 12582912.0f;
x = FmaImpl(y, -1.57079601e+00f, x);
x = FmaImpl(y, -3.13916473e-07f, x);
return FmaImpl(y, -5.39030253e-15f, x);
}
* Determines the appropriate radian reduction method based on the input angle's magnitude.
* Uses Payne-Hanek for large angles and Cody-Waite for small angles.
*
* @param x The input angle in radians.
* @param threshold The threshold to decide between Payne-Hanek and Cody-Waite.
* @param quadrant Pointer to store the quadrant information.
* @return The reduced angle in the range [0, pi/2).
*/
__SIMT_DEVICE_FUNCTIONS_DECL__ inline float TrigRadianReduction(float x, float threshold, int *quadrant)
{
x = FmaImpl(x, 0.0f, x);
if (AbsImpl(x) > threshold) {
return PayneHanekRadianReduction(x, quadrant);
} else {
return CodyWaiteRadianReduction(x, quadrant);
}
}
* Computes the cosine of an angle using a polynomial approximation.
* Formula: cos(x) = 1-(1/2!)x^2+(1/4!)x^4+...+((-1)^n/(2n)!)x^2n+O(x^{2n+2})
* The input angle should be in the range [-pi/2, pi/2].
*
* @param x The input angle in radians.
* @return The cosine of the input angle.
*/
__SIMT_DEVICE_FUNCTIONS_DECL__ inline float CosPoly(float x)
{
x = x * x;
float y = FmaImpl(x, 2.44677067e-5f, -1.38877297e-3f);
y = FmaImpl(x, y, 4.16666567e-2f);
y = FmaImpl(x, y, -5.00000000e-1f);
return FmaImpl(x, y, 1.00000000e+0f);
}
* Computes the sine of an angle using a polynomial approximation.
* Formula: sin(x) = x-(1/3!)x^3+(1/5!)x^5+...+((-1)^n/(2n+1)!)x^{2n+1}+O(x^{2n+3})
* The input angle should be in the range [-pi/2, pi/2].
*
* @param x The input angle in radians.
* @return The sine of the input angle.
*/
__SIMT_DEVICE_FUNCTIONS_DECL__ inline float SinPoly(float x)
{
float y = x * x;
float m = FmaImpl(x, y, 0.0f);
float z = FmaImpl(y, 2.86567956e-6f, -1.98559923e-4f);
z = FmaImpl(y, z, 8.33338592e-3f);
z = FmaImpl(y, z, -1.66666672e-1f);
return FmaImpl(z, m, x);
}
* Computes the cosine of an angle using trigonometric identities and polynomial approximations.
* Handles angles in the full range by reducing them to [-pi/2, pi/2] and applying identities.
*
* @param x The input angle in radians.
* @return The cosine of the input angle.
* Special cases:
* if x is NaN, return NaN;
* if x is inf, return NaN;
*/
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T CosImpl(T x)
{
static_assert(SupportTypeSimtInternel<T, float>, "Input type of input only supports float.");
int quadrant;
float y = TrigRadianReduction(x, 71476.0625f, &quadrant);
float c = CosPoly(y);
float s = SinPoly(y);
if (quadrant & 2) {
s = -s;
c = -c;
}
if (quadrant & 1) {
c = -s;
}
return c;
}
* Computes the sine of an angle using trigonometric identities and polynomial approximations.
* Handles angles in the full range by reducing them to [-pi/2, pi/2] and applying identities.
*
* @param x The input angle in radians.
* @return The sine of the input angle.
* Special cases:
* if x is NaN, return NaN;
* if x is inf, return NaN;
*/
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T SinImpl(T x)
{
static_assert(SupportTypeSimtInternel<T, float>, "Input type of input only supports float.");
int quadrant;
float y = TrigRadianReduction(x, 71476.0625f, &quadrant);
float c = CosPoly(y);
float s = SinPoly(y);
if (quadrant & 2) {
s = -s;
c = -c;
}
if (quadrant & 1) {
s = c;
}
return s;
}
* Computes both sine and cosine of an angle using trigonometric identities and polynomial approximations.
* Handles angles in the full range by reducing them to [-pi/2, pi/2] and applying identities.
*
* @param x The input angle in radians.
* @param s Reference to store the sine of the input angle.
* @param c Reference to store the cosine of the input angle.
*/
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline void SinCosImpl(T x, T &s, T &c)
{
static_assert(SupportTypeSimtInternel<T, float>, "Input type of input only supports float.");
int quadrant;
float t;
float y = TrigRadianReduction(x, 71476.0625f, &quadrant);
float cos = CosPoly(y);
float sin = SinPoly(y);
if (quadrant & 2) {
sin = -sin;
cos = -cos;
}
if (quadrant & 1) {
t = -sin;
sin = cos;
cos = t;
}
s = sin;
c = cos;
}
* Computes the tangent of an angle using a polynomial approximation.
* The input angle should be in the range [-pi/2, pi/2].
* Formula: tanx=x+(1/3)*x^3+(2/15)*x^5+(17/315)*x^7+(62/2835)*x^9+(1382/155925)*x^11+
* +(21844/6081075)*x^13+(929569/638512875)*x^15+...+O(...)
*
* @param x The input angle in radians.
* @return The tangent of the input angle.
*/
__SIMT_DEVICE_FUNCTIONS_DECL__ inline float TanPoly(float x)
{
x = x * x;
float y = FmaImpl(x, 4.38117981e-3f, 8.94600598e-5f);
y = FmaImpl(x, y, 1.08341556e-2f);
y = FmaImpl(x, y, 2.12811474e-2f);
y = FmaImpl(x, y, 5.40602170e-2f);
y = FmaImpl(x, y, 1.33326918e-1f);
y = FmaImpl(x, y, 3.33333433e-1f);
return x * y;
}
* Computes the tangent of an angle using trigonometric identities and polynomial approximations.
* Handles angles in the full range by reducing them to [-pi/2, pi/2] and applying identities.
*
* @param x The input angle in radians.
* @return The tangent of the input angle.
*/
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T TanImpl(T x)
{
static_assert(SupportTypeSimtInternel<T, float>, "Input type of input only supports float.");
int quadrant;
float y = TrigRadianReduction(x, 252.898206f, &quadrant);
float t = TanPoly(y);
float z = FmaImpl(t, y, y);
if (quadrant & 1) {
float s = y - z;
s = FmaImpl(t, y, s);
t = -1.0f / z;
z = FmaImpl(z, t, 1.0f);
z = FmaImpl(s, t, z);
z = FmaImpl(z, t, t);
}
return z;
}
#if defined(ASCENDC_CPU_DEBUG)
__SIMT_DEVICE_FUNCTIONS_DECL__ inline float TanhImpl(float x)
{
return tanh(x);
}
#else
__SIMT_DEVICE_FUNCTIONS_DECL__ inline float TanhImpl(float x)
{
return 1.0f - (2.0f / (ExpImpl(2.0f * x) + 1.0f));
}
#endif
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline float TanPiImpl(T x)
{
static_assert(SupportTypeSimtInternel<T, float>, "Input type of input only supports float.");
return TanImpl(x * ConstantsInternal::PI);
}
__SIMT_DEVICE_FUNCTIONS_DECL__ inline void TaylorExpand(float &dst, float &src, float &squareV, uint32_t expandLevel, float *factor)
{
squareV = src * src;
dst = src * src;
dst = dst * factor[expandLevel];
for (int i = expandLevel - 1; i > 0; i--) {
dst = dst + factor[i];
dst = dst * squareV;
}
dst = dst + factor[0];
dst = dst * src;
}
__SIMT_DEVICE_FUNCTIONS_DECL__ inline void TaylorExpand(float &dst, float &src, float &squareV, uint32_t expandLevel)
{
float factor[] = {1,
-0.3333333333333333,
0.2,
-0.14285714285714285,
0.1111111111111111,
-0.09090909090909091,
0.07692307692307693};
TaylorExpand(dst, src, squareV, expandLevel, factor);
}
#if defined(ASCENDC_CPU_DEBUG)
__SIMT_DEVICE_FUNCTIONS_DECL__ inline float AtanImpl(float x)
{
return atan(x);
}
#else
__SIMT_DEVICE_FUNCTIONS_DECL__ inline void AtanExpand(float &dst, float &src, float &tmp, float transFactor)
{
dst = src * transFactor;
dst = dst + 1.0f;
tmp = src - transFactor;
dst = tmp / dst;
dst = AbsImpl(dst);
}
__SIMT_DEVICE_FUNCTIONS_DECL__ inline void Sign(float &dst, float &src, float &denominator)
{
dst = src * 4611686018427387904.0f;
denominator = AbsImpl(dst);
denominator = denominator + 2.168404344971009e-19f;
dst = dst / denominator;
}
__SIMT_DEVICE_FUNCTIONS_DECL__ inline float AtanImpl(float x)
{
float clip = MinImpl(x, 10000.0f);
clip = MaxImpl(clip, -10000.0f);
float absV = AbsImpl(clip);
float dst = 0;
float squareV = 0;
float tmp = 0;
float tmp2 = 0;
TaylorExpand(dst, absV, squareV, 4);
AtanExpand(tmp, absV, tmp2, 0.4142135623730950);
TaylorExpand(tmp2, tmp, squareV, 4);
tmp2 = tmp2 + ConstantsInternal::PI_OF_8;
dst = MinImpl(dst, tmp2);
tmp2 = absV + 1.0f;
tmp = absV - 1.0f;
tmp = tmp / tmp2;
tmp = AbsImpl(tmp);
TaylorExpand(tmp2, tmp, squareV, 4);
tmp2 = tmp2 + ConstantsInternal::PI_OF_4;
dst = MinImpl(dst, tmp2);
AtanExpand(tmp2, tmp, squareV, 0.4142135623730950);
TaylorExpand(tmp, tmp2, squareV, 6);
tmp = tmp + ConstantsInternal::PI_OF_8;
tmp = tmp + ConstantsInternal::PI_OF_4;
dst = MinImpl(dst, tmp);
Sign(tmp, clip, tmp2);
dst = dst * tmp;
return dst;
}
#endif
#if defined(ASCENDC_CPU_DEBUG)
__SIMT_DEVICE_FUNCTIONS_DECL__ inline float Atan2Impl(float y, float x)
{
return atan2(y, x);
}
#else
atan2(y, x) =
atan(y/x) x > 0
atan(y/x) + PI y >= 0, x < 0
atan(y/x) - PI y < 0, x < 0
PI/2 y > 0, x = 0
-PI/2 y < 0, x = 0
0 y = 0, x = 0
*/
__SIMT_DEVICE_FUNCTIONS_DECL__ inline float Atan2Impl(float y, float x)
{
if (IsNanImpl(y)) {
return y;
} else if (IsNanImpl(x)) {
return x;
}
int d = (y >= 0) ? 1 : -1;
if (IsInfImpl(y) && IsInfImpl(x)) {
int s = 1;
if (x < 0) {
s = 3;
}
return d * ConstantsInternal::PI_OF_4 * s;
} else if (IsInfImpl(y)) {
return d * ConstantsInternal::PI_OF_2;
} else if (IsInfImpl(x)) {
if (x > 0) {
d = 0;
}
return d * ConstantsInternal::PI;
}
if (x == 0) {
if (y == 0) {
d = 0;
}
return d * ConstantsInternal::PI_OF_2;
} else if (x > 0) {
d = 0;
}
return AtanImpl(y / x) + d * ConstantsInternal::PI;
}
#endif
#if defined(ASCENDC_CPU_DEBUG)
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T LogImpl(T x)
{
return logf(x);
}
#else
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T LogImpl(T x)
{
if (x > 0.0f && x < 1.17549435e-38f) {
return __logf(ExpImpl(23.0f) * x) - 23.0f;
}
return __logf(x);
}
__SIMT_DEVICE_FUNCTIONS_DECL__ inline half2 LogImpl(half2 x)
{
return __log(x);
}
#endif
#if defined(ASCENDC_CPU_DEBUG)
__SIMT_DEVICE_FUNCTIONS_DECL__ inline float AtanhImpl(float x)
{
return atanh(x);
}
#else
__SIMT_DEVICE_FUNCTIONS_DECL__ inline float AtanhImpl(float x)
{
return LogImpl((1.0f + x) / (1.0f - x)) / 2.0f;
}
#endif
* Rquare root of X
* Formula:Sqrt(x) = sqrt(x)
* @param x a value
* @return SqrtImpl(x)
* Special cases:
* if x is NaN, return NaN;
* if x is inf, return inf;
* if x is -inf, return NaN;
*/
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T SqrtImpl(T x)
{
static_assert(SupportTypeSimtInternel<T, float>, "Input type of input only supports float.");
#if defined(ASCENDC_CPU_DEBUG)
return sqrt(x);
#else
return __sqrtf(x);
#endif
}
* Reciprocal of Sqrt(x)
* Formula:Rsqrt(x) = 1 / sqrt(x)
* @param x a value
* @return RsqrtImpl(x)
* Special cases:
* if x is NaN, return NaN;
* if x is inf, return 0;
* if x is -inf, return NaN;
*/
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T RsqrtImpl(T x)
{
static_assert(SupportTypeSimtInternel<T, float>, "Input type of input only supports float.");
return 1.0f / SqrtImpl(x);
}
* * Computes cosh values based on input.
* * Formula: cosh(x) = (e^x+e^(-x))/2 = e^(x-ln2) + 0.25/(e^(x-ln2)).
* *
* * @param x
* * @return Cosh(x)
* * Special cases:
* * if x is NaN, return NaN;
* * if x = 89.14 y = 2.5821419964548915e+38, exp(89.14-Ln(2)) is limit overflow in 32
* * if x = -89.14 y = 2.5821419964548915e+38, exp(-89.14-Ln(2)) limit 0 in 32 bit,
* * 0.25/exp(-89.14-Ln(2)) is overflow in 32 bit
* * use Cosh(x) = Cosh(-x) to solve exp(-89.14-Ln(2)) overflow
* */
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T CoshImpl(T x)
{
static_assert(SupportTypeSimtInternel<T, float>, "Input type of input only supports float.");
float y = AbsImpl(x);
const float tmp = ExpImpl(y - ConstantsInternal::SCALAR_LN2);
return tmp + 0.25f / tmp;
}
* Computes cospi values based on input.
* Formula: cospi(x) = cos(x*PI)
*
* @param x
* @return Cospi(x)
* Special cases:
* if x is NaN, return NaN;
* if x is Inf, return NaN;
*/
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T CospiImpl(T x)
{
static_assert(SupportTypeSimtInternel<T, float>, "Input type of input only supports float.");
return CosImpl(x * ConstantsInternal::PI);
}
* Computes cos values based on input.
* Formula:
* asin(x) = asin(sqrt(1-x^2)) - pi/2, x belongs to (-1, -2^(-0.5))
* asin(x) = the taylor expansion, x belongs to (-2^(-0.5), 2^(-0.5))
* asin(x) = pi/2 -asin(sqrt(1-x^2)), x belongs to (2^(-0.5), 1)
* taylor : (((k_nx^2 + k_n) * x^2 + k_(n-1)) * x^2 +k_(n-2) ……)*x^2 +k_0)*x.
* if x in [-1, 1]
* x >= -1 && x < 0 && x^2 > 0.5f => x belongs to (-1, -2^(-0.5))
* x > 0 && x <= 1 && x^2 > 0.5f => x belongs to (2^(-0.5), 1)
* other, x belongs to (-2^(-0.5), 2^(-0.5))
*
* @param x
* @return Asin(x)
* Special cases:
* if x is NaN, return x itself;
* if x is inf, return NaN;
* if |x|>1, return NaN with invalid signal.
*/
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T AsinImpl(T x)
{
static_assert(SupportTypeSimtInternel<T, float>, "Input type of input only supports float.");
if (AbsImpl(x) > 1) {
return ConstantsInternal::SIMT_FP32_INF / ConstantsInternal::SIMT_FP32_INF;
}
float squareV = 0;
float dst = 0;
float src = x;
float factor[] = {
1.0,
0.16666666666666666666666666666667,
0.075,
0.04464285714285714285714285714286,
0.03038194444444444444444444444444,
0.02237215909090909090909090909091,
0.01735276442307692307692307692308,
0.01396484375,
};
if (AbsImpl(x) <= 0.7071067811865476f) {
TaylorExpand(dst, src, squareV, 7, factor);
return dst;
} else if (x < -0.7071067811865476f) {
src = SqrtImpl(1.0f - x * x);
TaylorExpand(dst, src, squareV, 7, factor);
return dst - ConstantsInternal::PI_OF_2;
} else {
src = SqrtImpl(1.0f - x * x);
TaylorExpand(dst, src, squareV, 7, factor);
return ConstantsInternal::PI_OF_2 - dst;
}
}
* Computes Acos values based on input.
* Formula: Acos(x) = pi/2-Asin(x).
*
* @param x
* @return Acos(x)
* Special cases:
* if x is NaN, return x itself;
* if x is inf, return NaN;
* if |x|>1, return NaN.
*/
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T AcosImpl(T x)
{
static_assert(SupportTypeSimtInternel<T, float>, "Input type of input only supports float.");
return ConstantsInternal::PI_OF_2 - AsinImpl(x);
}
* Computes Acosh values based on input.
* Formula:acosh(x) = ln(x + sqrt(x^2 - 1))
*
* @param x
* @return Acosh(x)
* Special cases:
* if x is NaN, return x itself;
* if x<-1, return NaN.
*/
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T AcoshImpl(T x)
{
static_assert(SupportTypeSimtInternel<T, float>, "Input type of input only supports float.");
if (x < 1) {
return ConstantsInternal::SIMT_FP32_INF / ConstantsInternal::SIMT_FP32_INF;
}
return LogImpl(x + SqrtImpl(x * x - 1.0f));
}
* Computes Sinh values based on input.
* Formula: sinh(x) = (e^x-e^(-x))/2 = e^(x-ln2) - 0.25/(e^(x-ln2)).
*
* @param x
* @return Sinh(x)
* Special cases:
* if x is NaN, return NaN;
*/
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T SinhImpl(T x)
{
static_assert(SupportTypeSimtInternel<T, float>, "Input type of input only supports float.");
if (AbsImpl(x) > 0.1f) {
return ExpImpl(x - ConstantsInternal::SCALAR_LN2) - ExpImpl(x * (-1.0f) - ConstantsInternal::SCALAR_LN2);
} else {
float squareV = 0;
float dst = 0;
float src = x;
float factor[] = {1.0,
0.16666666666666666666666666666667,
0.00833333333333333333333333333333,
0.0001984126984126984,
2.7557319223985893e-06,
2.505210838544172e-08};
TaylorExpand(dst, src, squareV, 5, factor);
return dst;
}
}
* Computes Sinpi values based on input.
* Formula: sinpi(x) = sin(x*PI)
*
* @param x
* @return Sinpi(x)
* Special cases:
* if x is NaN, return NaN;
* if x is Inf, return NaN;
*/
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T SinpiImpl(T x)
{
static_assert(SupportTypeSimtInternel<T, float>, "Input type of input only supports float.");
return SinImpl(x * ConstantsInternal::PI);
}
* Computes Asinh values based on input.
* Formula:asinh(x) = ln(x + sqrt(x^2 + 1))
* asinh(x) = -asinh(-x)
*
* @param x
* @return Asinh(x)
* Special cases:
* if x is NaN, return x itself;
* if inf, return inf.
*/
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T AsinhImpl(T x)
{
static_assert(SupportTypeSimtInternel<T, float>, "Input type of input only supports float.");
if (AbsImpl(x) > 0.1f) {
return x > 0 ? LogImpl(x + SqrtImpl(x * x + 1.0f)) : LogImpl(SqrtImpl(x * x + 1.0f) - x) * (-1);
} else {
float squareV = 0;
float dst = 0;
float src = x;
float factor[] = {
1.0,
-0.16666666666666666666666666666667,
0.075,
-0.04464285714285714285714285714286,
0.03038194444444444444444444444444,
-0.02237215909090909090909090909091,
0.01735276442307692307692307692308,
-0.01396484375,
};
TaylorExpand(dst, src, squareV, 7, factor);
return dst;
}
}
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline void SinCospiImpl(T x, T &s, T &c)
{
static_assert(SupportTypeSimtInternel<T, float>, "Input type of input only supports float.");
return SinCosImpl(x * ConstantsInternal::PI, s, c);
}
* Computes Hypot values based on input.
* Formula:HypotImpl(x, y) = sqrt(x^2 + y^2)
*
* @param x, y
* @return HypotImpl(x, y)
* Special cases:
* HypotImpl(x,y) is INF if x or y is +INF or -INF; else
* HypotImpl(x,y) is NAN if x or y is NAN.
*/
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T HypotImpl(T x, T y)
{
static_assert(SupportTypeSimtInternel<T, float>, "Input type of input only supports float.");
float absX = AbsImpl(x);
float absY = AbsImpl(y);
if (IsPositiveInfImpl(absX) || IsPositiveInfImpl(absY)) {
return ConstantsInternal::SIMT_FP32_INF;
}
if (IsNanImpl(absX)) {
return absX;
}
if (IsNanImpl(absY)) {
return absY;
}
float a = MaxImpl(absX, absY);
float b = MinImpl(absX, absY);
if (b == 0.0f) {
return a;
}
float r = b / a;
return a * SqrtImpl(FmaImpl(r, r, 1.0f));
}
* Computes Rhypot values based on input.
* Formula:RhypotImpl(x, y) = 1 / sqrt(x^2 + y^2) = 1 / HypotImpl(x, y)
*
* @param x a value
* @param y a value
* @return RhypotImpl(x, y)
* Special cases:
* RhypotImpl(x,y) is 0 if x or y is +INF or -INF; else
* RhypotImpl(x,y) is NAN if x or y is NAN.
*/
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T RhypotImpl(T x, T y)
{
static_assert(SupportTypeSimtInternel<T, float>, "Input type of input only supports float.");
return 1.0f / HypotImpl(x, y);
}
* Break VALUE into a normalized fraction and an integral power of 2.
* Formula:x = Frexp(x, exp) * 2^exp
*
* @param x a value
* @param &exp a value
* @return FrexpImpl(x, &exp)
* Special cases:
* if x is NaN, return x itself, exp=0;
* if x is inf, return x itself, exp=0;
* if x is -inf, return x itself, exp=0;
*/
template <typename T1, typename T2>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T1 FrexpImpl(T1 x, T2 &exp)
{
static_assert(SupportTypeSimtInternel<T1, float>, "Input type of input(x) only supports float.");
static_assert(SupportTypeSimtInternel<T2, int>, "Input type of input(exp) only supports int.");
if (x == 0.0f || IsPositiveInfImpl(AbsImpl(x)) || IsNanImpl(x)) {
exp = 0;
return x;
}
uint32_t u32 = reinterpret_cast<uint32_t &>(x);
int32_t exponent = u32 & 0x7f800000;
int32_t f32ExpVal = exponent >> 23;
uint32_t manU32 = u32 & 0x007fffff;
float f32ManU32 = static_cast<float>(manU32);
f32ManU32 = f32ManU32 / (1 << 23);
if (f32ExpVal == 0) {
if (f32ManU32 < 0.5f) {
while (f32ManU32 < 0.5f) {
f32ManU32 = f32ManU32 * 2;
f32ExpVal--;
}
}
} else {
f32ManU32 = f32ManU32 / 2 + 0.5f;
}
exp = f32ExpVal - 126;
return CopySignImpl(f32ManU32, x);
}
* X times (two to the EXP power).
* Formula:Ldexp(x, exp) = x * 2^exp
* @param x a value
* @param exp a value
* @return LdexpImpl(x, &exp)
* Special cases:
* if x is NaN, return x itself;
* if x is inf, return x itself;
* if x is -inf, return x itself;
*/
template <typename T1, typename T2>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T1 LdexpImpl(T1 x, T2 exp)
{
static_assert(SupportTypeSimtInternel<T1, float>, "Input type of input only supports float.");
static_assert(SupportTypeSimtInternel<T2, int>, "Input type of input(exp) only supports int.");
if (x == 0.0f || IsPositiveInfImpl(AbsImpl(x)) || IsNanImpl(x) || exp == 0) {
return x;
}
if (exp > 280) {
return CopySignImpl(ConstantsInternal::SIMT_FP32_INF, x);
}
if (exp < -280) {
return CopySignImpl(0.0f, x);
}
int32_t shift = 30;
if (exp > 0) {
while (exp > shift) {
x *= (1 << shift);
exp -= shift;
}
x *= (1 << exp);
} else {
while (exp < -30) {
x *= 1.0f / (1 << shift);
exp += shift;
}
x *= 1.0f / (1 << (-exp));
}
return x;
}
* Calculate the square root of the sum of squares of 3D coordinates.
* Formula:Norm3dImpl(a, b, c) = sqrt(a^2 + b^2 + c^2)
* = max(a,b,c)*sqrt({a/max}^2 + {b/max}^2 + {c/max}^2)
*
* @param a float value
* @param b float value
* @param c float value
* @return Norm3dImpl(a, b, c)
* Special cases:
* If any one of a,b,c is ±INF, return INF.
* If any one of a,b,c is NAN and other is not ±INF, return NAN.
* If all of a,b,c is 0, return 0.
* If sqrt(a^2 + b^2 + c^2) overflows, return INF.
*/
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T Norm3dImpl(T a, T b, T c)
{
static_assert(SupportTypeSimtInternel<T, float>, "Input type only supports float.");
if (IsInfImpl(a) || IsInfImpl(b) || IsInfImpl(c)) {
return ConstantsInternal::SIMT_FP32_INF;
}
if (IsNanImpl(a) || IsNanImpl(b) || IsNanImpl(c)) {
return ConstantsInternal::SIMT_FP32_INF / ConstantsInternal::SIMT_FP32_INF;
}
float m = MaxImpl(AbsImpl(a), AbsImpl(b));
m = MaxImpl(m, AbsImpl(c));
if (m == 0.0f) {
return 0.0f;
}
float r = 0.0f;
r = FmaImpl((a / m), (a / m), r);
r = FmaImpl((b / m), (b / m), r);
r = FmaImpl((c / m), (c / m), r);
return m * SqrtImpl(r);
}
* Calculate the reciprocal of the square root of the sum of squares of 3D coordinates.
* Formula:Rnorm3dImpl(a, b, c) = 1 / sqrt(a^2 + b^2 + c^2)
* = 1 / Norm3dImpl(a, b, c)
*
* @param a float value
* @param b float value
* @param c float value
* @return Norm3dImpl(a, b, c)
* Special cases:
* If any one of a,b,c is ±INF, return 0.
* If any one of a,b,c is NAN and other is not ±INF, return NAN.
* If all of a,b,c is 0, return INF.
* If sqrt(a^2 + b^2 + c^2) overflows, return INF.
*/
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T Rnorm3dImpl(T a, T b, T c)
{
static_assert(SupportTypeSimtInternel<T, float>, "Input type only supports float.");
return 1.0f / Norm3dImpl(a, b, c);
}
* Calculate the square root of the sum of squares of 4D coordinates.
* Formula:Norm4dImpl(a, b, c, d) = sqrt(a^2 + b^2 + c^2 + d^2)
* = max(a,b,c,d)*sqrt({a/max}^2 + {b/max}^2 + {c/max}^2+ {d/max}^2)
*
* @param a float value
* @param b float value
* @param c float value
* @param d float value
* @return Norm4dImpl(a, b, c, d)
* Special cases:
* If any one of a,b,c,d is ±INF, return INF.
* If any one of a,b,c,d is NAN and other is not ±INF, return NAN.
* If all of a,b,c,d is 0, return 0.
* If sqrt(a^2 + b^2 + c^2+ d^2) overflows, return INF.
*/
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T Norm4dImpl(T a, T b, T c, T d)
{
static_assert(SupportTypeSimtInternel<T, float>, "Input type only supports float.");
if (IsInfImpl(a) || IsInfImpl(b) || IsInfImpl(c) || IsInfImpl(d)) {
return ConstantsInternal::SIMT_FP32_INF;
}
if (IsNanImpl(a) || IsNanImpl(b) || IsNanImpl(c) || IsNanImpl(d)) {
return ConstantsInternal::SIMT_FP32_INF / ConstantsInternal::SIMT_FP32_INF;
}
float m = MaxImpl(AbsImpl(a), AbsImpl(b));
m = MaxImpl(m, AbsImpl(c));
m = MaxImpl(m, AbsImpl(d));
if (m == 0.0f) {
return 0.0f;
}
float r = 0.0f;
r = FmaImpl((a / m), (a / m), r);
r = FmaImpl((b / m), (b / m), r);
r = FmaImpl((c / m), (c / m), r);
r = FmaImpl((d / m), (d / m), r);
return m * SqrtImpl(r);
}
* Calculate the reciprocal of the square root of the sum of squares of 4D coordinates.
* Formula:Rnorm4dImpl(a, b, c, d) = 1 / sqrt(a^2 + b^2 + c^2 + d^2)
* = 1 / Norm4dImpl(a, b, c, d)
*
* @param a float value
* @param b float value
* @param c float value
* @param d float value
* @return Rnorm4dImpl(a, b, c, d)
* Special cases:
* If any one of a,b,c,d is ±INF, return 0.
* If any one of a,b,c,d is NAN and other is not ±INF, return NAN.
* If all of a,b,c,d is 0, return INF.
* If sqrt(a^2 + b^2 + c^2 + d^2) overflows,return 0.
*/
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T Rnorm4dImpl(T a, T b, T c, T d)
{
static_assert(SupportTypeSimtInternel<T, float>, "Input type only supports float.");
return 1.0f / Norm4dImpl(a, b, c, d);
}
* Calculate the square root of the sum of squares of N coordinates.
* Formula:NormImpl(n, a) = sqrt(a[0]^2 +... + a[n-1]^2)
* = max(a[i]) * sqrt({a[0]/max}^2+...+{a[n-i]/max}^2)
*
* @param n int value, computes dimension of a
* @param a float* value, computes data
* @return NormImpl(n, a)
* Special cases:
* If any one of a[i] is ±INF, return 0.
* If any one of a[i] is NAN and other is not ±INF, return NAN.
* If all of a,b,c,d is 0, return 0.
* If sqrt(a[0]^2 +... + a[n-1]^2) overflows, return INF.
* If n is less than 1, return |a[0]|.
*/
template <typename T1, typename T2>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T2 NormImpl(T1 n, T2* a)
{
if (n <= 0) {
return AbsImpl(a[0]);
}
float m = 0;
int32_t hasNan = 0;
for (int i = 0; i < n; i++) {
m = MaxImpl(m, AbsImpl(a[i]));
if (IsInfImpl(a[i])) {
return ConstantsInternal::SIMT_FP32_INF;
}
if (IsNanImpl(a[i])) {
hasNan = 1;
}
}
if (hasNan) {
return ConstantsInternal::SIMT_FP32_INF / ConstantsInternal::SIMT_FP32_INF;
}
if (m == 0.0f || IsNanImpl(m)) {
return m;
}
float sum = 0.0f;
for (int i = 0; i < n; i++) {
sum = FmaImpl((a[i] / m), (a[i] / m), sum);
}
return m * SqrtImpl(sum);
}
* Calculate the reciprocal of the square root of the sum of squares of N coordinates.
* Formula:RnormImpl(n, a) = 1 / sqrt(a[0]^2 +... + a[n-1]^2)
* = 1 / NormImpl(n, a)
*
* @param n int value, computes dimension of a
* @param a float* value, computes data
* @return RnormImpl(n, a)
* Special cases:
* If any one of a[i] is ±INF, return 0.
* If any one of a[i] is NAN and other is not ±INF, return NAN.
* If all of a[i] is 0, return INF.
* If sqrt(a[0]^2 +... + a[n-1]^2) overflows, return 0.
* If n is less than 1, return 1/|a[0]|.
*/
template <typename T1, typename T2>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T2 RnormImpl(T1 n, T2* a)
{
return 1.0f / NormImpl(n, a);
}
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T PowImpl(T x, T y)
{
static_assert(SupportTypeSimtInternel<T, float>, "Input type only supports float.");
#if defined(ASCENDC_CPU_DEBUG)
if (x < 0.0f) {
return NAN;
}
if (AbsImpl(x) == 1.0f && std::isinf(y)) {
return NAN;
}
if (x == 1.0f && std::isnan(y)) {
return NAN;
}
if ((x == 0.0f || std::isnan(x) || std::isinf(x)) && y == 0.0f) {
return NAN;
}
return pow(x, y);
#else
return __powf(x, y);
#endif
}
__SIMT_DEVICE_FUNCTIONS_DECL__ inline float Exp2Impl(float x)
{
return PowImpl(2.0f, x);
}
__SIMT_DEVICE_FUNCTIONS_DECL__ inline float Exp10Impl(float x)
{
return PowImpl(10.0f, x);
}
__SIMT_DEVICE_FUNCTIONS_DECL__ inline float Expm1Impl(float x)
{
return ExpImpl(x) - 1.0f;
}
#if defined(ASCENDC_CPU_DEBUG)
__SIMT_DEVICE_FUNCTIONS_DECL__ inline float Log2Impl(float x)
{
return log2(x);
}
#else
__SIMT_DEVICE_FUNCTIONS_DECL__ inline float Log2Impl(float x)
{
return LogImpl(x) / LogImpl(2.0f);
}
#endif
#if defined(ASCENDC_CPU_DEBUG)
__SIMT_DEVICE_FUNCTIONS_DECL__ inline float Log10Impl(float x)
{
return log10(x);
}
#else
__SIMT_DEVICE_FUNCTIONS_DECL__ inline float Log10Impl(float x)
{
return LogImpl(x) / LogImpl(10.0f);
}
#endif
__SIMT_DEVICE_FUNCTIONS_DECL__ inline float Log1pImpl(float x)
{
return LogImpl(1.0f + x);
}
#if defined(ASCENDC_CPU_DEBUG)
__SIMT_DEVICE_FUNCTIONS_DECL__ inline float LogbImpl(float x)
{
if (x < 0) {
x = -x;
}
return floor(log2(x));
}
#else
__SIMT_DEVICE_FUNCTIONS_DECL__ inline float LogbImpl(float x)
{
if (IsNanImpl(x)) {
return x;
}
if (x < 0) {
x = -x;
}
float inf = ConstantsInternal::SIMT_FP32_INF;
if (IsInfImpl(x)) {
return inf;
}
if (x == 0) {
return -inf;
}
uint32_t fp32InfExponent = 255;
uint32_t fp32DecimalBit = 23;
uint32_t fp32SignBit = 256;
uint32_t fp32ExponentH = 127;
uint32_t *exponent = (uint32_t *)&x;
(*exponent) >>= fp32DecimalBit;
uint32_t sign = fp32SignBit;
if ((*exponent) > sign) {
(*exponent) -= sign;
}
if ((*exponent) == fp32InfExponent) {
return inf;
} else {
float res = (*exponent);
res -= fp32ExponentH;
return res;
}
}
#endif
#if defined(ASCENDC_CPU_DEBUG)
__SIMT_DEVICE_FUNCTIONS_DECL__ inline int ILogbImpl(float x)
{
if (x < 0) {
x = -x;
}
if (x == INFINITY) {
return ConstantsInternal::S32_MAX_VAL;
}
return static_cast<int>(LogbImpl(x));
}
#else
__SIMT_DEVICE_FUNCTIONS_DECL__ inline int ILogbImpl(float x)
{
if (x < 0) {
x = -x;
}
if (IsNanImpl(x)) {
return ConstantsInternal::S32_MIN_VAL;
}
return static_cast<int>(LogbImpl(x));
}
#endif
* calculates a cube root by input x.
* @param x a value
* @return cbrt(x)
* Special cases:
* if x is 0, return 0
* if x is Nan, return Nan;
* if x is Inf, return Inf;
* if x is -Inf, return -Inf;
*/
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T CbrtImpl(T x)
{
static_assert(SupportTypeSimtInternel<T, float>, "Input value type only supports float.");
uint32_t xBits = *reinterpret_cast<uint32_t *>(&x);
int32_t expBits = (xBits >> 23) & 0xFF;
if (x == 0.0f || expBits == 0xFF) {
return x;
}
int32_t exponent = expBits - 127;
int32_t k;
if (exponent >= 3) {
k = ((exponent - 3) / 3) + 1;
} else if (exponent <= -4) {
k = (exponent + 1) / 3;
} else {
k = 0;
}
int32_t expAdjustedBits = exponent - 3 * k + 127;
uint32_t xAdjustedBits = (xBits & 0x7FFFFF) | (expAdjustedBits << 23);
float xAdjusted = *reinterpret_cast<float *>(&xAdjustedBits);
float y = 1.0f;
y = (2.0f * y + xAdjusted / (y * y)) / 3.0f;
y = (2.0f * y + xAdjusted / (y * y)) / 3.0f;
y = (2.0f * y + xAdjusted / (y * y)) / 3.0f;
y = (2.0f * y + xAdjusted / (y * y)) / 3.0f;
y = (2.0f * y + xAdjusted / (y * y)) / 3.0f;
uint32_t yBits = *reinterpret_cast<uint32_t *>(&y);
int32_t yExpBits = ((yBits >> 23) & 0xFF) + k;
yBits = (yBits & 0x807FFFFF) | ((yExpBits & 0xFF) << 23) | (xBits & 0x80000000);
return *reinterpret_cast<float *>(&yBits);
}
* calculates reciprocal of the cube root by input x.
* @param x a value
* @return rcbrt(x)
* Special cases:
* if x is 0, return Inf
* if x is Nan, return Nan;
* if x is Inf, return 0;
* if x is -Inf, return 0;
*/
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T RcbrtImpl(T x)
{
static_assert(SupportTypeSimtInternel<T, float>, "Input value type only supports float.");
if (x == 0.0f) {
return ConstantsInternal::SIMT_FP32_INF;
}
if (IsNanImpl(x)) {
return x;
}
if (IsInfImpl(x)) {
return 0.0f;
}
uint32_t xBits = *reinterpret_cast<uint32_t *>(&x);
int32_t expBits = (xBits >> 23) & 0xFF;
int32_t yExpBits = (508 - expBits) / 3;
uint32_t yBits = (xBits & 0x80000000) | (yExpBits << 23);
float y = *reinterpret_cast<float *>(&yBits);
y = y * (4.0f - x * y * y * y) / 3.0f;
y = y * (4.0f - x * y * y * y) / 3.0f;
y = y * (4.0f - x * y * y * y) / 3.0f;
y = y * (4.0f - x * y * y * y) / 3.0f;
y = y * (4.0f - x * y * y * y) / 3.0f;
return y;
}
#if defined(ASCENDC_CPU_DEBUG)
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T ErfImpl(T x)
{
return erf(x);
}
#else
* Calculate the error function of the input x.
* @param x a value
* @return erf(x)
* Special cases:
* if x is 0, return 0
* if x is Inf, return 1;
* if x is -Inf, return -1;
* if x is Nan, return Nan;
*/
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T ErfImpl(T x)
{
float absX = AbsImpl(x);
float xSquared = x * x;
if (absX >= 1.00296f) {
float term = absX;
const float a1 = 0.000112198715f;
const float a2 = -0.0013275252f;
const float a3 = 0.008396535f;
const float a4 = -0.040246583f;
const float a5 = 0.15950431f;
const float a6 = 0.9129177f;
const float a7 = 0.62906002f;
float polyTerm = FmaImpl(a1, term, a2);
polyTerm = FmaImpl(polyTerm, term, a3);
polyTerm = FmaImpl(polyTerm, term, a4);
polyTerm = FmaImpl(polyTerm, term, a5);
polyTerm = FmaImpl(polyTerm, term, a6);
polyTerm = FmaImpl(polyTerm, term, a7);
float result = FmaImpl(polyTerm, -absX, -absX);
float expResult = Exp2Impl(result);
float adjustedExp = 1.0f - expResult;
uint32_t signBit = *reinterpret_cast<uint32_t *>(&x) & 0x80000000;
uint32_t finalBits = signBit | *reinterpret_cast<uint32_t *>(&adjustedExp);
return *reinterpret_cast<float *>(&finalBits);
} else {
float term = xSquared;
const float a1 = 0.000084834944f;
const float a2 = -0.00082130916f;
const float a3 = 0.005213489f;
const float a4 = -0.026868773f;
const float a5 = 0.11284005f;
const float a6 = -0.37612664f;
const float a7 = 0.12837915f;
float polyTerm = FmaImpl(a1, term, a2);
polyTerm = FmaImpl(polyTerm, term, a3);
polyTerm = FmaImpl(polyTerm, term, a4);
polyTerm = FmaImpl(polyTerm, term, a5);
polyTerm = FmaImpl(polyTerm, term, a6);
polyTerm = FmaImpl(polyTerm, term, a7);
return FmaImpl(polyTerm, x, x);
}
}
#endif
#if defined(ASCENDC_CPU_DEBUG)
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T ErfcImpl(T x)
{
return erfc(x);
}
#else
* Calculate the complementary error function of the input x.
* @param x a value
* @return erfc(x)
* Special cases:
* if x is Inf, return 2;
* if x is -Inf, return +0;
* if x is Nan, return Nan;
*/
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T ErfcImpl(T x)
{
float absX = AbsImpl(x);
float term1 = absX + -4.0f;
float term2 = absX + 4.0f;
float invTerm2 = 1.0f / term2;
float y = term1 * invTerm2;
float z = y + 1.0f;
float numerator = FmaImpl(-4.0f, z, absX);
float tmp = FmaImpl(-y, absX, numerator);
float w = FmaImpl(invTerm2, tmp, y);
float poly = FmaImpl(0.0008912171f, w, 0.007045788f);
poly = FmaImpl(poly, w, -0.015866896f);
poly = FmaImpl(poly, w, 0.036429625f);
poly = FmaImpl(poly, w, -0.06664343f);
poly = FmaImpl(poly, w, 0.09381453f);
poly = FmaImpl(poly, w, -0.10099056f);
poly = FmaImpl(poly, w, 0.068094f);
poly = FmaImpl(poly, w, 0.015377387f);
poly = FmaImpl(poly, w, -0.13962108f);
poly = FmaImpl(poly, w, 1.2329951f);
float tmp2 = FmaImpl(2.0f, absX, 1.0f);
float invTmp2 = 1.0f / tmp2;
float q = poly * invTmp2;
float t = FmaImpl(absX, q * -2.0f, poly);
float u = t - q;
float v = FmaImpl(u, invTmp2, q);
float xSquared = absX * absX;
float negX2 = -xSquared;
float f1 = 1.442695f;
float scaled = negX2 * f1;
float intPart = TruncImpl(scaled);
float absPart = AbsImpl(intPart);
uint32_t signBit = *reinterpret_cast<uint32_t *>(&intPart) & 0x80000000;
float clampedBits = signBit | 0x42FC0000;
float clamped = *reinterpret_cast<float *>(&clampedBits);
float safeInt = (absPart > 126.0f) ? clamped : intPart;
float remainder = FmaImpl(safeInt, -0.6931472f, negX2);
remainder = FmaImpl(safeInt, 1.9046542e-9f, remainder);
float exponentArg = remainder * f1;
float exponentBase = safeInt + 12583039.0f;
uint32_t exponentBits = *reinterpret_cast<uint32_t *>(&exponentBase) << 23;
float exponentScale = *reinterpret_cast<float *>(&exponentBits);
float expVal = Exp2Impl(exponentArg) * exponentScale;
float term3 = FmaImpl(-absX, absX, xSquared);
float term4 = FmaImpl(expVal, term3, expVal);
float result = v * term4;
if (absX > 10.055f) {
result = 0.0f;
}
return (x < 0) ? (2.0f - result) : result;
}
#endif
* Calculate the inverse error function of the input x.
* @param x a value
* @return erfinv(x)
* Special cases:
* if x is 0, return 0;
* if x is 1, return Inf;
* if x is -1, return -Inf;
* if x outside [-1, 1], return Nan;
* if x is Nan, return Nan;
*/
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T ErfinvImpl(T x)
{
float oppositeX = -x;
float temp1 = FmaImpl(x, oppositeX, 1.0f);
float log2Temp = Log2Impl(temp1);
float negLog2 = -log2Temp;
if (log2Temp < -8.2f) {
float rsqrtNegLog = RsqrtImpl(negLog2);
float poly = FmaImpl(-0.5899144f, rsqrtNegLog, -0.6630042f);
poly = FmaImpl(poly, rsqrtNegLog, 1.5970111f);
poly = FmaImpl(poly, rsqrtNegLog, -0.67521554f);
poly = FmaImpl(poly, rsqrtNegLog, -0.09522479f);
poly = FmaImpl(poly, rsqrtNegLog, 0.83535343f);
float denominator = 1.0f / rsqrtNegLog;
float finalTerm = denominator * poly;
uint32_t signBit = *reinterpret_cast<uint32_t *>(&x) & 0x80000000;
uint32_t resultBits = signBit | *reinterpret_cast<uint32_t *>(&finalTerm);
return *reinterpret_cast<float *>(&resultBits);
} else {
float poly = FmaImpl(-2.5172708e-10f, negLog2, 9.427429e-9f);
poly = FmaImpl(poly, negLog2, -1.2054752e-7f);
poly = FmaImpl(poly, negLog2, 2.1697005e-7f);
poly = FmaImpl(poly, negLog2, 0.0000080621484f);
poly = FmaImpl(poly, negLog2, -0.000031675492f);
poly = FmaImpl(poly, negLog2, -0.0007743631f);
poly = FmaImpl(poly, negLog2, 0.005546588f);
poly = FmaImpl(poly, negLog2, 0.16082023f);
poly = FmaImpl(poly, negLog2, 0.8862269f);
return poly * x;
}
}
* Calculate the inverse complementary error function of the input x.
* @param x a value
* @return erfcinv(x)
* Special cases:
* if x is 0, return Inf;
* if x is 2, return -Inf;
* if x outside [0, 2], return Nan;
* if x is Nan, return Nan;
*/
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T ErfcinvImpl(T x)
{
float oppositeX = -x;
float term = 2.0f + oppositeX;
if (x <= 1.9966f && x >= 0.0034f) {
float term2 = term * x;
float logTerm = Log2Impl(term2);
float negLog = -logTerm;
float poly = FmaImpl(-2.5172708e-10f, negLog, 9.427429e-9f);
poly = FmaImpl(poly, negLog, -1.2054752e-7f);
poly = FmaImpl(poly, negLog, 2.1697005e-7f);
poly = FmaImpl(poly, negLog, 0.0000080621484f);
poly = FmaImpl(poly, negLog, -0.000031675492f);
poly = FmaImpl(poly, negLog, -0.0007743631f);
poly = FmaImpl(poly, negLog, 0.005546588f);
poly = FmaImpl(poly, negLog, 0.16082023f);
poly = FmaImpl(poly, negLog, 0.8862269f);
return FmaImpl(poly, oppositeX, poly);
} else {
bool isGtOne = x > 1.0f;
float term2 = isGtOne ? term : x;
float logTerm = Log2Impl(term2);
float negLog = -logTerm;
float rsqrtLog = RsqrtImpl(negLog);
float poly = FmaImpl(-63.113224f, rsqrtLog, 127.48469f);
poly = FmaImpl(poly, rsqrtLog, -114.10568f);
poly = FmaImpl(poly, rsqrtLog, 60.325786f);
poly = FmaImpl(poly, rsqrtLog, -21.789892f);
poly = FmaImpl(poly, rsqrtLog, 6.467409f);
poly = FmaImpl(poly, rsqrtLog, -1.8329474f);
poly = FmaImpl(poly, rsqrtLog, -0.030327774f);
poly = FmaImpl(poly, rsqrtLog, 0.83287745f);
float invRsqrt = 1.0f / rsqrtLog;
float signAdj = isGtOne ? -invRsqrt : invRsqrt;
return poly * signAdj;
}
}
* Calculate the scaled complementary error function of the input x.
* @param x a value
* @return erfcx(x)
* Special cases:
* if x is -Inf, return Inf;
* if x is Inf, return +0;
* if x is Nan, return Nan;
*/
template <typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T ErfcxImpl(T x)
{
if (x < -9.43f) {
return ConstantsInternal::SIMT_FP32_INF;
}
float absX = AbsImpl(x);
if (absX < 10.055f) {
float term1 = absX - 4.0f;
float term2 = absX + 4.0f;
float invTerm2 = 1.0f / term2;
float y = term1 * invTerm2;
float z = y + 1.0f;
float numerator = FmaImpl(-4.0f, z, absX);
float tmp = FmaImpl(-y, absX, numerator);
float w = FmaImpl(invTerm2, tmp, y);
float poly = FmaImpl(0.0008912171f, w, 0.007045788f);
poly = FmaImpl(poly, w, -0.015866896f);
poly = FmaImpl(poly, w, 0.036429625f);
poly = FmaImpl(poly, w, -0.06664343f);
poly = FmaImpl(poly, w, 0.09381453f);
poly = FmaImpl(poly, w, -0.10099056f);
poly = FmaImpl(poly, w, 0.068094f);
poly = FmaImpl(poly, w, 0.015377387f);
poly = FmaImpl(poly, w, -0.13962108f);
poly = FmaImpl(poly, w, 1.2329951f);
float term3 = FmaImpl(2.0f, absX, 1.0f);
float invTerm3 = 1.0f / term3;
float q = poly * invTerm3;
float t = FmaImpl(absX, q * -2.0f, poly);
float u = t - q;
float result = FmaImpl(u, invTerm3, q);
if (x > 0) {
return result;
}
float xSq = absX * absX;
float negX2 = -xSq;
float term4 = FmaImpl(absX, absX, negX2);
float term5 = FmaImpl(xSq, 0.00572498f, 0.5f);
term5 = MinImpl(term5, ConstantsInternal::SIMT_FP32_INF);
float term6 = FmaImpl(term5, 252.0f, 12582913.0f);
float term7 = term6 -12583039.0f;
float negTerm7 = -term7;
float term8 = FmaImpl(xSq, 1.442695f, negTerm7);
float term9 = FmaImpl(xSq, 1.925963e-8f, term8);
uint32_t exponent = *reinterpret_cast<uint32_t *>(&term6) << 23;
float exponentScale = *reinterpret_cast<float *>(&exponent);
float term9Exp = Exp2Impl(term9);
float scaledExp = term9Exp * exponentScale;
float expApprox = FmaImpl(term9Exp, exponentScale, scaledExp);
float finalExpApprox = FmaImpl(expApprox, term4, expApprox);
bool isInf = IsInfImpl(expApprox);
result = isInf ? expApprox : (finalExpApprox - result);
return result;
} else {
float scaledX = absX * 0.25f;
float reciprocalX = 0.25f / scaledX;
float w = reciprocalX * reciprocalX;
float poly = FmaImpl(6.5625f, w, -1.875f);
poly = FmaImpl(poly, w, 0.75f);
poly = FmaImpl(poly, w, -0.5f);
poly = FmaImpl(poly, w, 1.0f);
float scaledReciprocal = reciprocalX * 0.5641896f;
float result = scaledReciprocal * poly;
return result;
}
}
__SIMT_DEVICE_FUNCTIONS_DECL__ inline float ComputeSinpi(float x)
{
float doubleX = x * 2;
float y0 = static_cast<float>(NearByIntImpl(doubleX));
int32_t i = static_cast<int32_t>(y0);
float f = FmaImpl(-y0, 0.5f, x);
float fPi = f * 3.14159274f;
float fPiSquare = fPi * fPi;
float y = 0.0f;
if ((i & 1) != 0) {
y = 2.42795795e-05f;
y = FmaImpl(y, fPiSquare, -0.00138878601f);
y = FmaImpl(y, fPiSquare, 0.0416667275f);
y = FmaImpl(y, fPiSquare, -0.49999997f);
float y2 = FmaImpl(fPiSquare, 1.0f, 0.0f);
y = FmaImpl(y, y2, 1.0f);
} else {
y = -0.000195746587f;
y = FmaImpl(y, fPiSquare, 0.00833270326f);
y = FmaImpl(y, fPiSquare, -0.166666627f);
float y2 = FmaImpl(fPiSquare, fPi, 0.0f);
y = FmaImpl(y, y2, fPi);
}
if ((i & 2) != 0) {
y = FmaImpl(y, -1.0f, 0.0f);
}
return y;
}
__SIMT_DEVICE_FUNCTIONS_DECL__ inline float ComputeLn(float x)
{
float offset = 0;
if (x < 1.17549435e-38f) {
offset = -23;
x = x * 8388608;
}
uint32_t u32 = *reinterpret_cast<uint32_t*>(&x);
int32_t y1 = (u32 - 1059760811) & -8388608;
int32_t y2 = u32 - y1;
float mantissa = *reinterpret_cast<float*>(&y2);
mantissa = mantissa - 1.0f;
float exponent = FmaImpl(static_cast<float>(y1), 1.1920929e-07f, offset);
float y = -0.130188569f;
y = FmaImpl(y, mantissa, 0.140846103f);
y = FmaImpl(y, mantissa, -0.121486276f);
y = FmaImpl(y, mantissa, 0.139806107f);
y = FmaImpl(y, mantissa, -0.166842356f);
y = FmaImpl(y, mantissa, 0.200122997f);
y = FmaImpl(y, mantissa, -0.249996692f);
y = FmaImpl(y, mantissa, 0.333331823f);
y = FmaImpl(y, mantissa, -0.5f);
y = mantissa * y;
y = FmaImpl(y, mantissa, mantissa);
y = FmaImpl(exponent, 0.693147182f, y);
if (u32 >= ConstantsInternal::SIMT_INT32_INF || x == 0) {
y = FmaImpl(x, ConstantsInternal::SIMT_FP32_INF, ConstantsInternal::SIMT_FP32_INF);
}
return y;
}
* Calculates gamma value by input x, x in (-1.5, 1.5).
*
* @param x a value
* @return x's gamma-value
*/
__SIMT_DEVICE_FUNCTIONS_DECL__ inline float EulerGammaFunction(float x)
{
float frac = x - NearByIntImpl(x);
float y = -0.00107286568f;
y = FmaImpl(y, frac, 0.00711105345f);
y = FmaImpl(frac, y, -0.0096437186f);
y = FmaImpl(frac, y, -0.042180188f);
y = FmaImpl(frac, y, 0.166540906f);
y = FmaImpl(frac, y, -0.0420036502f);
y = FmaImpl(frac, y, -0.655878186f);
y = FmaImpl(frac, y, 0.577215672f);
y = FmaImpl(frac, y, 1.0f);
if (x < -0.5f) {
y = y * x * frac;
}
if (x <= 0.5f && x >= -0.5f) {
y = y * frac;
}
if (AbsImpl(y) < 1.1754943e-38f) {
int32_t e = 0;
float m = FrexpImpl(y, e);
return LdexpImpl(1.0f / m, 0 - e);
} else {
return 1.0f / y;
}
}
* Calculates gamma value by input x, x in (-inf, -1.5] or [1.5, inf).
*
* Formula:
* f(x) = (x-1)! = sqrt(2*pi*x) * [(x/e)^x] * [1 + 1/(12*x) + 1/(288*x^2) + O(3)] / x, x >= 1.5
* f(x) = pi / [sin(pi*x) * f(1-x)], x <= -1.5
* @param x a value
* @return x's gamma-value
*/
__SIMT_DEVICE_FUNCTIONS_DECL__ inline float StirlingAndEulerReflection(float x)
{
float absX= AbsImpl(x);
if (absX > 41.0999985f) {
x = CopySignImpl(41.0999985f, x);
absX = AbsImpl(x);
}
uint32_t u32 = reinterpret_cast<uint32_t &>(absX);
int32_t expU32 = (u32 - 1060439283) & 0xFF800000;
int32_t manU32 = u32 - expU32;
float mantissa = *reinterpret_cast<float *>(&manU32);
float exponent = FmaImpl(static_cast<float>(expU32), 1.1920929e-07f, 0.0f);
float lnMantissa = 2.0f / (mantissa + 1.0f) * (mantissa - 1.0f);
float logX = FmaImpl(lnMantissa, 1.44269502f, exponent);
float logXDiff = FmaImpl(lnMantissa, 1.44269502f, exponent - logX);
float y3 = 0.000656886259f;
y3 = FmaImpl(y3, lnMantissa * lnMantissa, 0.00321816537f);
y3 = FmaImpl(y3, lnMantissa * lnMantissa, 0.0180337187f);
y3 = FmaImpl(y3, lnMantissa * lnMantissa, 0.120224588f);
y3 = FmaImpl(y3, lnMantissa * lnMantissa, 0.0f);
float r = 2.0f * (mantissa - 1.0f - lnMantissa) - lnMantissa * (mantissa - 1.0f);
logXDiff = FmaImpl(1.0f / (mantissa + 1.0f) * r, 1.44269502f, logXDiff);
logXDiff = FmaImpl(lnMantissa, 1.92513667e-08f, logXDiff);
logXDiff = FmaImpl(y3, lnMantissa, logXDiff);
float diff0 = logX - (logX + logXDiff) + logXDiff;
logX = logX + logXDiff;
float y01 = logX * (absX - 0.5f);
float y02 = 1.44269502f * absX;
float y0 = y01 - y02;
float diff1 = FmaImpl(logX, absX - 0.5f, -y01);
diff1 = FmaImpl(diff0, absX - 0.5f, diff1);
float diff2 = FmaImpl(1.44269502f, absX, -y02);
diff2 = FmaImpl(1.92596303e-08f, absX, diff2);
float y0Diff = (diff1 - diff2) - (y0 - y01 + y02);
float offset = 0.0f;
if (absX > 33.0f) {
offset = 48.0f;
}
if (x < 0.0f) {
y0 = offset - y0;
y0Diff = -y0Diff;
}
float i = NearByIntImpl(y0);
float f = y0 - i + y0Diff;
float y5 = PowImpl(2.0f, f) * PowImpl(2.0f, i) * 2.5066282f;
float recAbsX = 1.0f / absX;
float y6 = 0.000068413915f;
y6 = FmaImpl(y6, recAbsX, -0.000050603266f);
y6 = FmaImpl(y6, recAbsX, -0.00042276637f);
y6 = FmaImpl(y6, recAbsX, 0.0009921414f);
y6 = FmaImpl(y6, recAbsX, -0.00027855476f);
y6 = FmaImpl(y6, recAbsX, -0.002674901f);
y6 = FmaImpl(y6, recAbsX, 0.0034718033f);
y6 = FmaImpl(y6, recAbsX, 0.08333334f);
y6 = FmaImpl(y6, recAbsX, 0.0f);
if (x > 0) {
return FmaImpl(y5, y6, y5);
} else {
y6 = (y6 + 1);
float sinpi = ComputeSinpi(absX);
float yDiff = FmaImpl(y6 * x, sinpi, -y6 * x * sinpi);
float y7 = 1 / (y6 * x * sinpi);
float y = FmaImpl(y5, y7, -y5 * y7 * yDiff * y7);
y = y * 0.5f;
if (absX > 33) {
y = y * 3.5527136e-15f;
}
return y;
}
}
* Calculates gamma value by input x.
* @param x a value
* @return tgamma(x)
* Special cases:
* if x is 0, return Inf
* if x is Nan, return Nan;
* if x is Inf, return Inf;
* if x is -Inf, return nan;
*/
template<typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T TgammaImpl(T x)
{
if (x == 0.0f) {
return 1.0f / x;
}
if (x < 0.0f && NearByIntImpl(x) == x) {
return ConstantsInternal::SIMT_FP32_INF / ConstantsInternal::SIMT_FP32_INF;
}
float absX = AbsImpl(x);
if (absX < 1.5f) {
return EulerGammaFunction(x);
} else {
return StirlingAndEulerReflection(x);
}
}
* Calculates lgamma value by input x.
* @param x a value
* @return lgamma(x)
* Special cases:
* if x is 0, return Inf
* if x is Nan, return Nan;
* if x is Inf, return Inf;
* if x is -Inf, return Inf;
*/
template<typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T LgammaImpl(T x)
{
float absX = AbsImpl(x);
float result = 0.0f;
if (IsInfImpl(absX)) {
return absX;
} else if (absX < 0.7f) {
float y0 = 0.0035875155f;
y0 = FmaImpl(y0, absX, -0.0054712854f);
y0 = FmaImpl(y0, absX, -0.044627126f);
y0 = FmaImpl(y0, absX, 0.1673177f);
y0 = FmaImpl(y0, absX, -0.04213598f);
y0 = FmaImpl(y0, absX, -0.6558673f);
y0 = FmaImpl(y0, absX, 0.5772154f);
y0 = FmaImpl(y0, absX, 0.0f);
y0 = FmaImpl(y0, absX, absX);
result = ComputeLn(y0);
result = -result;
if (y0 == 0.0f) {
result = 1.0f / 0.0f;
}
} else if (absX < 1.5f) {
float oneMinusX = 1.0f - absX;
result = 0.045882664f;
result = FmaImpl(result, oneMinusX, 0.10373967f);
result = FmaImpl(result, oneMinusX, 0.122803635f);
result = FmaImpl(result, oneMinusX, 0.12752421f);
result = FmaImpl(result, oneMinusX, 0.14321668f);
result = FmaImpl(result, oneMinusX, 0.16934357f);
result = FmaImpl(result, oneMinusX, 0.20740793f);
result = FmaImpl(result, oneMinusX, 0.2705875f);
result = FmaImpl(result, oneMinusX, 0.40068542f);
result = FmaImpl(result, oneMinusX, 0.82246696f);
result = FmaImpl(result, oneMinusX, 0.5772157f);
result = FmaImpl(result, oneMinusX, 0.0f);
} else if (absX < 3.0f) {
float xMinusTwo = absX - 2.0f;
result = 0.0000495984932f;
result = FmaImpl(result, xMinusTwo, -0.00022089484f);
result = FmaImpl(result, xMinusTwo, 0.000541314250f);
result = FmaImpl(result, xMinusTwo, -0.00120451697f);
result = FmaImpl(result, xMinusTwo, 0.00288425176f);
result = FmaImpl(result, xMinusTwo, -0.00738275796f);
result = FmaImpl(result, xMinusTwo, 0.0205813199f);
result = FmaImpl(result, xMinusTwo, -0.0673524886f);
result = FmaImpl(result, xMinusTwo, 0.322467029f);
result = FmaImpl(result, xMinusTwo, 0.42278432f);
result = FmaImpl(result, absX, -result - result);
} else if (absX < 7.8f) {
float xMinusThree = absX - 3.0f;
float a0 = -143033.4f, a1 = -48310.664f, a2 = -41061.375f, a3 = -12349.742f, a4 = -748.8903f;
float b0 = -206353.58f, b1 = -92685.05f, b2 = -10777.18f, b3 = -259.25097f;
float y0 = FmaImpl(a4, xMinusThree, a3);
y0 = FmaImpl(y0, xMinusThree, a2);
y0 = FmaImpl(y0, xMinusThree, a1);
y0 = FmaImpl(y0, xMinusThree, a0);
float y1 = FmaImpl(1.0f, xMinusThree, b3);
y1 = FmaImpl(y1, xMinusThree, b2);
y1 = FmaImpl(y1, xMinusThree, b1);
y1 = FmaImpl(y1, xMinusThree, b0);
result = FmaImpl(y0, 1.0f / y1, xMinusThree);
} else {
float y0 = (1.0f / absX);
float y1 = y0 * y0;
float y2 = 0.00077783066f;
y2 = FmaImpl(y2, y1, -0.0027776553f);
y2 = FmaImpl(y2, y1, 0.083333276f);
y2 = FmaImpl(y2, y0, 0.0f);
float y3 = ComputeLn(absX) * 0.5f * (absX - 0.5f);
result = y3 - absX + y3 + y2 + 0.9189385f;
}
if (x < 0) {
if (FloorIntrinsicsImpl(absX) == absX) {
return ConstantsInternal::SIMT_FP32_INF;
} else if (absX < 9.9999996e-20f) {
result = -ComputeLn(absX);
} else {
float sinpi = ComputeSinpi(absX);
float lnXSinpi = ComputeLn(absX * AbsImpl(sinpi));
float y = 1.14472985f - lnXSinpi;
result = FmaImpl(y, 1.0f, -result);
}
}
return result;
}
* Calculates CylBesselI0 value by input x.
* @param x a value
* @return CylBesselI0(x)
* Special cases:
* if x is 0, return 1;
* if x is Nan, return Nan;
* if x is Inf, return Inf;
* if x is -Inf, return Inf;
*/
template<typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T CylBesselI0Impl(T x)
{
float absX = AbsImpl(x);
if (IsInfImpl(absX)) {
return absX;
}
if (absX >= 9) {
float reciprocalX = 1.0f / absX;
float y = 0.34872168f;
y = FmaImpl(y, reciprocalX, -0.0054563344f);
y = FmaImpl(y, reciprocalX, 0.033347155f);
y = FmaImpl(y, reciprocalX, 0.027889195f);
y = FmaImpl(y, reciprocalX, 0.04987063f);
y = FmaImpl(y, reciprocalX, 0.39894226f);
y = y * RsqrtImpl(absX);
return y * (ExpImpl(absX * 0.5f) - 1) * (ExpImpl(absX * 0.5f) + 1) + y;
} else {
float squareX = absX * absX;
float y = 1.551427e-19;
y = FmaImpl(y, squareX, 1.4492505e-17f);
y = FmaImpl(y, squareX, 1.0687647e-14f);
y = FmaImpl(y, squareX, 2.3349575e-12f);
y = FmaImpl(y, squareX, 4.7306625e-10f);
y = FmaImpl(y, squareX, 6.7778003e-8f);
y = FmaImpl(y, squareX, 0.0000067820783f);
y = FmaImpl(y, squareX, 0.00043402583f);
y = FmaImpl(y, squareX, 0.015625f);
y = FmaImpl(y, squareX, 0.25f);
y = FmaImpl(y, squareX, 1);
return y;
}
}
* Calculates CylBesselI1 value by input x.
* @param x a value
* @return CylBesselI1(x)
* Special cases:
* if x is 0, return 0;
* if x is Nan, return Nan;
* if x is Inf, return Inf;
* if x is -Inf, return -Inf;
*/
template<typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T CylBesselI1Impl(T x)
{
float absX = AbsImpl(x);
if (IsInfImpl(absX)) {
return x;
}
if (IsNanImpl(absX)) {
return absX;
}
if (absX >= 8.085f) {
float reciprocalX = 1.0f / absX;
float y = -0.5028813f;
y = FmaImpl(y, reciprocalX, 0.028471555f);
y = FmaImpl(y, reciprocalX, -0.04873671f);
y = FmaImpl(y, reciprocalX, -0.04641596f);
y = FmaImpl(y, reciprocalX, -0.14960973f);
y = FmaImpl(y, reciprocalX, 0.39894232f);
y = y * RsqrtImpl(absX);
y = y * (ExpImpl(absX * 0.5f) - 1) * (ExpImpl(absX * 0.5f) + 1) + y;
return CopySignImpl(y, x);
} else {
float squareX = x * x;
float y = 2.7848253e-18f;
y = FmaImpl(y, squareX, 3.4224707e-16f);
y = FmaImpl(y, squareX, 1.6258002e-13f);
y = FmaImpl(y, squareX, 3.3142173e-11f);
y = FmaImpl(y, squareX, 5.6632734e-9f);
y = FmaImpl(y, squareX, 6.780027e-7f);
y = FmaImpl(y, squareX, 0.00005425474f);
y = FmaImpl(y, squareX, 0.002604162f);
y = FmaImpl(y, squareX, 0.0625000f);
y = FmaImpl(y, squareX, 0.5f);
return y * x;
}
}
template<typename T>
__SIMT_DEVICE_FUNCTIONS_DECL__ inline T NormcdfImpl(T x)
{
if (AbsImpl(x) > 14.5f) {
x = CopySignImpl(14.5f, x);
}
float oneOverSqrt2High = -0.707106769f;
float xOverSqrt2High = x * oneOverSqrt2High;
float compensateValue = FmaImpl(x, oneOverSqrt2High, -xOverSqrt2High);
float oneOverSqrt2Low = -1.21016175e-8f;
float xOverSqrt2Low = FmaImpl(x, oneOverSqrt2Low, compensateValue);
float xOverSqrt2 = xOverSqrt2High + xOverSqrt2Low;
float erfcValue = ErfcImpl(xOverSqrt2);
if (x <= -1.0f) {
erfcValue = FmaImpl(-2.0f * xOverSqrt2 * erfcValue,
xOverSqrt2High - xOverSqrt2 + xOverSqrt2Low, erfcValue);
}
return 0.5f * erfcValue;
}
}
}
#endif