/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <immintrin.h>
#include <Columns/IColumn.h>
#include <DataTypes/IDataType.h>
#include <base/Decimal.h>
#include <base/extended_types.h>
#include <benchmark/benchmark.h>
#include <Common/PODArray.h>
#include <Common/TargetSpecific.h>

using namespace DB;

/// Uses addOverflow method (if available) to avoid UB for sumWithOverflow()
///
/// Since NO_SANITIZE_UNDEFINED works only for the function itself, without
/// callers, and in case of non-POD type (i.e. Decimal) you have overwritten
/// operator+=(), which will have UB.
template <typename T>
struct MyAdd
{
    static void NO_SANITIZE_UNDEFINED ALWAYS_INLINE add(T & lhs, const T & rhs) { lhs += rhs; }
};
template <typename DecimalNativeType>
struct MyAdd<Decimal<DecimalNativeType>>
{
    static void NO_SANITIZE_UNDEFINED ALWAYS_INLINE add(Decimal<DecimalNativeType> & lhs, const Decimal<DecimalNativeType> & rhs)
    {
        lhs.addOverflow(rhs);
    }
};

// _Pragma("clang attribute push(__attribute__((target(\"sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,bmi2\"))),apply_to=function)")

template <typename T>
struct MySumData
{
    using Impl = MyAdd<T>;
    T sum{};

    MULTITARGET_FUNCTION_AVX512BW_AVX512F_AVX2_SSE42(
        MULTITARGET_FUNCTION_HEADER(template <typename Value, bool add_if_zero> void NO_SANITIZE_UNDEFINED NO_INLINE),
        addManyConditionalInternalImpl,
        MULTITARGET_FUNCTION_BODY((
            const Value * __restrict ptr, const UInt8 * __restrict condition_map, size_t start, size_t end) /// NOLINT
                                  {
                                      ptr += start;
                                      condition_map += start;
                                      size_t count = end - start;
                                      const auto * end_ptr = ptr + count;

                                      if constexpr (
                                          (is_integer<T> && !is_big_int_v<T>)
                                          || (is_decimal<T> && !std::is_same_v<T, Decimal256> && !std::is_same_v<T, Decimal128>))
                                      {
                                          /// For integers we can vectorize the operation if we replace the null check using a multiplication (by 0 for null, 1 for not null)
                                          /// https://quick-bench.com/q/MLTnfTvwC2qZFVeWHfOBR3U7a8I
                                          T local_sum{};
                                          while (ptr < end_ptr)
                                          {
                                              T multiplier = !*condition_map == add_if_zero;
                                              Impl::add(local_sum, *ptr * multiplier);
                                              ++ptr;
                                              ++condition_map;
                                          }
                                          Impl::add(sum, local_sum);
                                          return;
                                      }

                                      if constexpr (std::is_floating_point_v<T>)
                                      {
                                          /// For floating point we use a similar trick as above, except that now we  reinterpret the floating point number as an unsigned
                                          /// integer of the same size and use a mask instead (0 to discard, 0xFF..FF to keep)
                                          static_assert(sizeof(Value) == 4 || sizeof(Value) == 8);
                                          using equivalent_integer = typename std::conditional_t<sizeof(Value) == 4, UInt32, UInt64>;

                                          constexpr size_t unroll_count = 128 / sizeof(T);
                                          T partial_sums[unroll_count]{};

                                          const auto * unrolled_end = ptr + (count / unroll_count * unroll_count);

                                          while (ptr < unrolled_end)
                                          {
                                              for (size_t i = 0; i < unroll_count; ++i)
                                              {
                                                  equivalent_integer value;
                                                  std::memcpy(&value, &ptr[i], sizeof(Value));
                                                  value &= (!condition_map[i] != add_if_zero) - 1;
                                                  Value d;
                                                  std::memcpy(&d, &value, sizeof(Value));
                                                  Impl::add(partial_sums[i], d);
                                              }
                                              ptr += unroll_count;
                                              condition_map += unroll_count;
                                          }

                                          for (size_t i = 0; i < unroll_count; ++i)
                                              Impl::add(sum, partial_sums[i]);
                                      }

                                      T local_sum{};
                                      while (ptr < end_ptr)
                                      {
                                          if (!*condition_map == add_if_zero)
                                              Impl::add(local_sum, *ptr);
                                          ++ptr;
                                          ++condition_map;
                                      }
                                      Impl::add(sum, local_sum);
                                  }))

    /// Vectorized version
    template <typename Value, bool add_if_zero>
    void NO_INLINE
    addManyConditionalInternal(const Value * __restrict ptr, const UInt8 * __restrict condition_map, size_t start, size_t end)
    {
#if USE_MULTITARGET_CODE
        if (isArchSupported(TargetArch::AVX512BW))
        {
            addManyConditionalInternalImplAVX512BW<Value, add_if_zero>(ptr, condition_map, start, end);
            return;
        }

        if (isArchSupported(TargetArch::AVX512F))
        {
            addManyConditionalInternalImplAVX512F<Value, add_if_zero>(ptr, condition_map, start, end);
            return;
        }

        if (isArchSupported(TargetArch::AVX2))
        {
            addManyConditionalInternalImplAVX2<Value, add_if_zero>(ptr, condition_map, start, end);
            return;
        }

        if (isArchSupported(TargetArch::SSE42))
        {
            addManyConditionalInternalImplSSE42<Value, add_if_zero>(ptr, condition_map, start, end);
            return;
        }
#endif
        addManyConditionalInternalImpl<Value, add_if_zero>(ptr, condition_map, start, end);
    }

    MULTITARGET_FUNCTION_AVX512BW_AVX512F_AVX2_SSE42(
        MULTITARGET_FUNCTION_HEADER(template <typename Value, bool add_if_zero> void NO_SANITIZE_UNDEFINED NO_INLINE),
        addManyConditionalInternalImplNew,
        MULTITARGET_FUNCTION_BODY((
            const Value * __restrict ptr, const UInt8 * __restrict condition_map, size_t start, size_t end) /// NOLINT
                                  {
                                      ptr += start;
                                      condition_map += start;
                                      size_t count = end - start;
                                      const auto * end_ptr = ptr + count;

                                      if constexpr ((is_integer<T> || is_decimal<T>)&&!is_over_big_int<T>)
                                      {
                                          /// For integers we can vectorize the operation if we replace the null check using a multiplication (by 0 for null, 1 for not null)
                                          /// https://quick-bench.com/q/MLTnfTvwC2qZFVeWHfOBR3U7a8I
                                          T local_sum{};
                                          while (ptr < end_ptr)
                                          {
                                              T multiplier = !*condition_map == add_if_zero;
                                              Impl::add(local_sum, *ptr * multiplier);
                                              ++ptr;
                                              ++condition_map;
                                          }
                                          Impl::add(sum, local_sum);
                                          return;
                                      }
                                      else if constexpr (is_integer<T>)
                                      {
                                          T local_sum{};
                                          using MaskType = std::conditional_t<sizeof(T) == 16, Int8, Int8>;
                                          alignas(64) const MaskType masks[2] = {0, -1};
                                          while (ptr < end_ptr)
                                          {
                                              Value v = *ptr;
                                              if constexpr (!add_if_zero)
                                                  v &= masks[!!*condition_map];
                                              else
                                                  v &= masks[!*condition_map];

                                              Impl::add(local_sum, v);
                                              ++ptr;
                                              ++condition_map;
                                          }
                                          Impl::add(sum, local_sum);
                                          return;
                                      }
                                      else if constexpr (is_decimal<T>)
                                      {
                                          T local_sum{};
                                          using MaskType = std::conditional_t<sizeof(T) == 16, Int8, Int8>;
                                          alignas(64) const MaskType masks[2] = {0, -1};
                                          while (ptr < end_ptr)
                                          {
                                              Value v = *ptr;
                                              if constexpr (!add_if_zero)
                                                  v.value &= masks[!!*condition_map];
                                              else
                                                  v.value &= masks[!*condition_map];

                                              Impl::add(local_sum, v);
                                              ++ptr;
                                              ++condition_map;
                                          }
                                          Impl::add(sum, local_sum);
                                          return;
                                      }
                                      else if constexpr (std::is_floating_point_v<T>)
                                      {
                                          /// For floating point we use a similar trick as above, except that now we  reinterpret the floating point number as an unsigned
                                          /// integer of the same size and use a mask instead (0 to discard, 0xFF..FF to keep)
                                          static_assert(sizeof(Value) == 4 || sizeof(Value) == 8);
                                          using equivalent_integer = typename std::conditional_t<sizeof(Value) == 4, UInt32, UInt64>;

                                          constexpr size_t unroll_count = 128 / sizeof(T);
                                          T partial_sums[unroll_count]{};

                                          const auto * unrolled_end = ptr + (count / unroll_count * unroll_count);

                                          while (ptr < unrolled_end)
                                          {
                                              for (size_t i = 0; i < unroll_count; ++i)
                                              {
                                                  equivalent_integer value;
                                                  std::memcpy(&value, &ptr[i], sizeof(Value));
                                                  value &= (!condition_map[i] != add_if_zero) - 1;
                                                  Value d;
                                                  std::memcpy(&d, &value, sizeof(Value));
                                                  Impl::add(partial_sums[i], d);
                                              }
                                              ptr += unroll_count;
                                              condition_map += unroll_count;
                                          }

                                          for (size_t i = 0; i < unroll_count; ++i)
                                              Impl::add(sum, partial_sums[i]);
                                      }

                                      T local_sum{};
                                      while (ptr < end_ptr)
                                      {
                                          Impl::add(local_sum, !*condition_map == add_if_zero ? *ptr : T{});
                                          ++ptr;
                                          ++condition_map;
                                      }
                                      Impl::add(sum, local_sum);
                                  }))

    /// Vectorized version
    template <typename Value, bool add_if_zero>
    void NO_INLINE
    addManyConditionalInternalNew(const Value * __restrict ptr, const UInt8 * __restrict condition_map, size_t start, size_t end)
    {
#if USE_MULTITARGET_CODE
        if (isArchSupported(TargetArch::AVX512BW))
        {
            addManyConditionalInternalImplNewAVX512BW<Value, add_if_zero>(ptr, condition_map, start, end);
            return;
        }

        if (isArchSupported(TargetArch::AVX512F))
        {
            addManyConditionalInternalImplNewAVX512F<Value, add_if_zero>(ptr, condition_map, start, end);
            return;
        }

        if (isArchSupported(TargetArch::AVX2))
        {
            addManyConditionalInternalImplNewAVX2<Value, add_if_zero>(ptr, condition_map, start, end);
            return;
        }

        if (isArchSupported(TargetArch::SSE42))
        {
            addManyConditionalInternalImplNewSSE42<Value, add_if_zero>(ptr, condition_map, start, end);
            return;
        }
#endif
        addManyConditionalInternalImplNew<Value, add_if_zero>(ptr, condition_map, start, end);
    }

    /*
    template <typename Value, bool add_if_zero>
    void NO_SANITIZE_UNDEFINED NO_INLINE addManyConditionalInternalImplSIMD(
        const Value * __restrict ptr, const UInt8 * __restrict condition_map, size_t start, size_t end) /// NOLINT
    {
        ptr += start;
        condition_map += start;
        size_t count = end - start;
        const auto * end_ptr = ptr + count;

        if constexpr ((is_integer<T> || is_decimal<T>)&&!is_over_big_int<T>)
        {
            /// For integers we can vectorize the operation if we replace the null check using a multiplication (by 0 for null, 1 for not null)
            /// https://quick-bench.com/q/MLTnfTvwC2qZFVeWHfOBR3U7a8I
            T local_sum{};
            while (ptr < end_ptr)
            {
                T multiplier = !*condition_map == add_if_zero;
                Impl::add(local_sum, *ptr * multiplier);
                ++ptr;
                ++condition_map;
            }
            Impl::add(sum, local_sum);
            return;
        }
        else if constexpr (is_integer<T>)
        {
            T local_sum{};
            using MaskType = std::conditional_t<sizeof(T) == 16, Int8, Int64>;
            alignas(64) const MaskType masks[2] = {0, -1};
            while (ptr < end_ptr)
            {
                T value = *ptr;
                if constexpr (sizeof(T) == 16)
                {
                    __m128i v = _mm_loadu_si128((__m128i *)&value);
                    __m128i c = _mm_set1_epi8(!*condition_map == add_if_zero);
                    __m128i r = _mm_and_si128(v, c);
                    _mm_storeu_si128((__m128i *)&value, r);
                }
                else
                {
                    __m256i v = _mm256_loadu_si256((__m256i *)&value);
                    __m256i c = _mm256_set1_epi8(!*condition_map == add_if_zero);
                    __m256i r = _mm256_and_si256(v, c);
                    _mm256_storeu_si256((__m256i *)&value, r);
                }

                Impl::add(local_sum, value);
                ++ptr;
                ++condition_map;
            }
            Impl::add(sum, local_sum);
            return;
        }
        else if constexpr (is_decimal<T>)
        {
            T local_sum{};
            using MaskType = std::conditional_t<sizeof(T) == 16, Int8, Int64>;
            alignas(64) const MaskType masks[2] = {0, -1};
            while (ptr < end_ptr)
            {
                Value v = *ptr;
                if constexpr (!add_if_zero)
                    v.value &= masks[*condition_map];
                else
                    v.value &= masks[!*condition_map];

                Impl::add(local_sum, v);
                ++ptr;
                ++condition_map;
            }
            Impl::add(sum, local_sum);
            return;
        }
        else if constexpr (std::is_floating_point_v<T>)
        {
            /// For floating point we use a similar trick as above, except that now we  reinterpret the floating point number as an unsigned
            /// integer of the same size and use a mask instead (0 to discard, 0xFF..FF to keep)
            static_assert(sizeof(Value) == 4 || sizeof(Value) == 8);
            using equivalent_integer = typename std::conditional_t<sizeof(Value) == 4, UInt32, UInt64>;

            constexpr size_t unroll_count = 128 / sizeof(T);
            T partial_sums[unroll_count]{};

            const auto * unrolled_end = ptr + (count / unroll_count * unroll_count);

            while (ptr < unrolled_end)
            {
                for (size_t i = 0; i < unroll_count; ++i)
                {
                    equivalent_integer value;
                    std::memcpy(&value, &ptr[i], sizeof(Value));
                    value &= (!condition_map[i] != add_if_zero) - 1;
                    Value d;
                    std::memcpy(&d, &value, sizeof(Value));
                    Impl::add(partial_sums[i], d);
                }
                ptr += unroll_count;
                condition_map += unroll_count;
            }

            for (size_t i = 0; i < unroll_count; ++i)
                Impl::add(sum, partial_sums[i]);
        }

        T local_sum{};
        while (ptr < end_ptr)
        {
            Impl::add(local_sum, !*condition_map == add_if_zero ? *ptr : T{});
            ++ptr;
            ++condition_map;
        }
        Impl::add(sum, local_sum);
    }

    template <typename Value, bool add_if_zero>
    void NO_INLINE
    addManyConditionalInternalSIMD(const Value * __restrict ptr, const UInt8 * __restrict condition_map, size_t start, size_t end)
    {
        addManyConditionalInternalImplSIMD<Value, add_if_zero>(ptr, condition_map, start, end);
    }
    */
};

// _Pragma("clang attribute pop")

static constexpr size_t ROWS = 65536;

static void initCondition(PaddedPODArray<UInt8> & cond)
{
    cond.resize(ROWS);
    for (size_t i = 0; i < ROWS; ++i)
        cond[i] = std::rand() % 2;
}

template <typename T>
static void initColumn(PaddedPODArray<T> & data)
{
    data.resize(ROWS);
    for (size_t i = 0; i < ROWS; ++i)
        data[i] = static_cast<T>(std::rand());
}

template <typename T>
static void BM_SumWithCondition(benchmark::State & state)
{
    PaddedPODArray<T> data;
    initColumn(data);
    PaddedPODArray<UInt8> cond;
    initCondition(cond);

    for (auto _ : state)
    {
        MySumData<T> sum_data;
        sum_data.template addManyConditionalInternal<T, false>(data.data(), cond.data(), 0, ROWS);
        benchmark::DoNotOptimize(sum_data);
    }
}

template <typename T>
static void BM_SumWithConditionNew(benchmark::State & state)
{
    PaddedPODArray<T> data;
    initColumn(data);
    PaddedPODArray<UInt8> cond;
    initCondition(cond);

    for (auto _ : state)
    {
        MySumData<T> sum_data;
        sum_data.template addManyConditionalInternalNew<T, false>(data.data(), cond.data(), 0, ROWS);
        benchmark::DoNotOptimize(sum_data);
    }
}


/*
template <typename T>
static void BM_SumWithConditionSIMD(benchmark::State & state)
{
    PaddedPODArray<T> data;
    initColumn(data);
    PaddedPODArray<UInt8> cond;
    initCondition(cond);

    for (auto _ : state)
    {
        MySumData<T> sum_data;
        sum_data.template addManyConditionalInternalSIMD<T, false>(data.data(), cond.data(), 0, ROWS);
        benchmark::DoNotOptimize(sum_data);
    }
}
*/

BENCHMARK_TEMPLATE(BM_SumWithCondition, Int64);
BENCHMARK_TEMPLATE(BM_SumWithConditionNew, Int64);
BENCHMARK_TEMPLATE(BM_SumWithCondition, UInt64);
BENCHMARK_TEMPLATE(BM_SumWithConditionNew, UInt64);
BENCHMARK_TEMPLATE(BM_SumWithCondition, Float64);
BENCHMARK_TEMPLATE(BM_SumWithConditionNew, Float64);

BENCHMARK_TEMPLATE(BM_SumWithCondition, Int128);
BENCHMARK_TEMPLATE(BM_SumWithConditionNew, Int128);
// BENCHMARK_TEMPLATE(BM_SumWithConditionSIMD, Int128);

BENCHMARK_TEMPLATE(BM_SumWithCondition, UInt128);
BENCHMARK_TEMPLATE(BM_SumWithConditionNew, UInt128);
// BENCHMARK_TEMPLATE(BM_SumWithConditionSIMD, UInt128);

BENCHMARK_TEMPLATE(BM_SumWithCondition, Int256);
BENCHMARK_TEMPLATE(BM_SumWithConditionNew, Int256);
// BENCHMARK_TEMPLATE(BM_SumWithConditionSIMD, Int256);

BENCHMARK_TEMPLATE(BM_SumWithCondition, UInt256);
BENCHMARK_TEMPLATE(BM_SumWithConditionNew, UInt256);
// BENCHMARK_TEMPLATE(BM_SumWithConditionSIMD, UInt256);

BENCHMARK_TEMPLATE(BM_SumWithCondition, Decimal32);
BENCHMARK_TEMPLATE(BM_SumWithConditionNew, Decimal32);

BENCHMARK_TEMPLATE(BM_SumWithCondition, Decimal64);
BENCHMARK_TEMPLATE(BM_SumWithConditionNew, Decimal64);

BENCHMARK_TEMPLATE(BM_SumWithCondition, Decimal128);
BENCHMARK_TEMPLATE(BM_SumWithConditionNew, Decimal128);

BENCHMARK_TEMPLATE(BM_SumWithCondition, Decimal256);
BENCHMARK_TEMPLATE(BM_SumWithConditionNew, Decimal256);


/*
Run on (32 X 2100 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x16)
  L1 Instruction 32 KiB (x16)
  L2 Unified 1024 KiB (x16)
  L3 Unified 11264 KiB (x2)
Load Average: 7.56, 5.47, 5.16
-----------------------------------------------------------------------------
Benchmark                                   Time             CPU   Iterations
-----------------------------------------------------------------------------
BM_SumWithCondition<Int64>               8930 ns         8929 ns        77846
BM_SumWithConditionNew<Int64>            8768 ns         8767 ns        80325
BM_SumWithCondition<UInt64>              8816 ns         8816 ns        80369
BM_SumWithConditionNew<UInt64>           8725 ns         8724 ns        80186
BM_SumWithCondition<Float64>            10229 ns        10228 ns        68275
BM_SumWithConditionNew<Float64>         10213 ns        10212 ns        68308
BM_SumWithCondition<Int128>            444262 ns       444247 ns         1575
BM_SumWithConditionNew<Int128>          87837 ns        87834 ns         7991
BM_SumWithCondition<UInt128>           433537 ns       433518 ns         1615
BM_SumWithConditionNew<UInt128>         88010 ns        88008 ns         7955
BM_SumWithCondition<Int256>            659032 ns       658995 ns         1048
BM_SumWithConditionNew<Int256>         189202 ns       189195 ns         3713
BM_SumWithCondition<UInt256>           479715 ns       479695 ns         1457
BM_SumWithConditionNew<UInt256>        198451 ns       198447 ns         3696
BM_SumWithCondition<Decimal32>           4662 ns         4662 ns       150015
BM_SumWithConditionNew<Decimal32>        4670 ns         4669 ns       149746
BM_SumWithCondition<Decimal64>           8742 ns         8742 ns        80315
BM_SumWithConditionNew<Decimal64>        8943 ns         8943 ns        76422
BM_SumWithCondition<Decimal128>        445999 ns       445990 ns         1550
BM_SumWithConditionNew<Decimal128>      88954 ns        88952 ns         8002
BM_SumWithCondition<Decimal256>        515128 ns       515111 ns         1371
BM_SumWithConditionNew<Decimal256>     223425 ns       223420 ns         3184
*/

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wbit-int-extension"
using NewInt128 = signed _BitInt(128);
using NewUInt128 = unsigned _BitInt(128);
using NewInt256 = signed _BitInt(256);
using NewUInt256 = unsigned _BitInt(256);
#pragma clang diagnostic pop

using OldInt128 = Int128;
using OldUInt128 = UInt128;
using OldInt256 = Int256;
using OldUInt256 = UInt256;

template <typename T>
static T generateRandomValue()
{
    T value;
    for (size_t i = 0; i < sizeof(T); ++i)
    {
        reinterpret_cast<uint8_t *>(&value)[i] = static_cast<uint8_t>(std::rand() % 256);
    }
    return value;
}


template <typename T>
static void BM_Addition(benchmark::State & state)
{
    T a = generateRandomValue<T>();
    T b = generateRandomValue<T>();
    for (auto _ : state)
    {
        T result = a + b;
        benchmark::DoNotOptimize(&result);
    }
}

template <typename T>
static void BM_Subtraction(benchmark::State & state)
{
    T a = generateRandomValue<T>();
    T b = generateRandomValue<T>();
    for (auto _ : state)
    {
        T result = a - b;
        benchmark::DoNotOptimize(&result);
    }
}

template <typename T>
static void BM_Multiplication(benchmark::State & state)
{
    T a = generateRandomValue<T>();
    T b = generateRandomValue<T>();
    for (auto _ : state)
    {
        T result = a * b;
        benchmark::DoNotOptimize(&result);
    }
}

template <typename T>
static void BM_Division(benchmark::State & state)
{
    T a = generateRandomValue<T>();
    T b = generateRandomValue<T>() + 1; // Avoid division by zero
    for (auto _ : state)
    {
        T result = a / b;
        benchmark::DoNotOptimize(&result);
    }
}
BENCHMARK_TEMPLATE(BM_Addition, OldInt128);
BENCHMARK_TEMPLATE(BM_Subtraction, OldInt128);
BENCHMARK_TEMPLATE(BM_Multiplication, OldInt128);
BENCHMARK_TEMPLATE(BM_Division, OldInt128);

BENCHMARK_TEMPLATE(BM_Addition, NewInt128);
BENCHMARK_TEMPLATE(BM_Subtraction, NewInt128);
BENCHMARK_TEMPLATE(BM_Multiplication, NewInt128);
BENCHMARK_TEMPLATE(BM_Division, NewInt128);

BENCHMARK_TEMPLATE(BM_Addition, OldUInt128);
BENCHMARK_TEMPLATE(BM_Subtraction, OldUInt128);
BENCHMARK_TEMPLATE(BM_Multiplication, OldUInt128);
BENCHMARK_TEMPLATE(BM_Division, OldUInt128);

BENCHMARK_TEMPLATE(BM_Addition, NewUInt128);
BENCHMARK_TEMPLATE(BM_Subtraction, NewUInt128);
BENCHMARK_TEMPLATE(BM_Multiplication, NewUInt128);
BENCHMARK_TEMPLATE(BM_Division, NewUInt128);

BENCHMARK_TEMPLATE(BM_Addition, OldInt256);
BENCHMARK_TEMPLATE(BM_Subtraction, OldInt256);
BENCHMARK_TEMPLATE(BM_Multiplication, OldInt256);
BENCHMARK_TEMPLATE(BM_Division, OldInt256);

BENCHMARK_TEMPLATE(BM_Addition, NewInt256);
BENCHMARK_TEMPLATE(BM_Subtraction, NewInt256);
BENCHMARK_TEMPLATE(BM_Multiplication, NewInt256);
BENCHMARK_TEMPLATE(BM_Division, NewInt256);

BENCHMARK_TEMPLATE(BM_Addition, OldUInt256);
BENCHMARK_TEMPLATE(BM_Subtraction, OldUInt256);
BENCHMARK_TEMPLATE(BM_Multiplication, OldUInt256);
BENCHMARK_TEMPLATE(BM_Division, OldUInt256);

BENCHMARK_TEMPLATE(BM_Addition, NewUInt256);
BENCHMARK_TEMPLATE(BM_Subtraction, NewUInt256);
BENCHMARK_TEMPLATE(BM_Multiplication, NewUInt256);
BENCHMARK_TEMPLATE(BM_Division, NewUInt256);

/*
Running ./build_gcc/utils/extern-local-engine/tests/benchmark_local_engine
Run on (32 X 2100 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x16)
  L1 Instruction 32 KiB (x16)
  L2 Unified 1024 KiB (x16)
  L3 Unified 11264 KiB (x2)
Load Average: 4.79, 5.12, 5.49

./build_gcc/utils/extern-local-engine/tests/benchmark_local_engine --benchmark_filter="(Addition|Subtraction|Multiplication|Division)<New.*>"
------------------------------------------------------------------------
Benchmark                              Time             CPU   Iterations
------------------------------------------------------------------------
BM_Addition<NewInt128>              1.43 ns         1.43 ns    488198747
BM_Subtraction<NewInt128>           1.51 ns         1.51 ns    486720421
BM_Multiplication<NewInt128>        1.52 ns         1.52 ns    450071487
BM_Division<NewInt128>              1.48 ns         1.48 ns    471973890
BM_Addition<NewUInt128>             1.46 ns         1.46 ns    480687874
BM_Subtraction<NewUInt128>          1.46 ns         1.46 ns    488204076
BM_Multiplication<NewUInt128>       1.45 ns         1.45 ns    468576127
BM_Division<NewUInt128>             1.48 ns         1.48 ns    477379447
BM_Addition<NewInt256>              2.49 ns         2.48 ns    291377319
BM_Subtraction<NewInt256>           2.52 ns         2.52 ns    284595240
BM_Multiplication<NewInt256>        2.48 ns         2.48 ns    276363723
BM_Division<NewInt256>              2.44 ns         2.44 ns    286877215
BM_Addition<NewUInt256>             2.53 ns         2.53 ns    266497385
BM_Subtraction<NewUInt256>          2.48 ns         2.48 ns    287899525
BM_Multiplication<NewUInt256>       2.45 ns         2.45 ns    287882140
BM_Division<NewUInt256>             2.47 ns         2.47 ns    288479037
*/

/*
./build_gcc/utils/extern-local-engine/tests/benchmark_local_engine --benchmark_filter="(Addition|Subtraction|Multiplication|Division)<Old.*>"
------------------------------------------------------------------------
Benchmark                              Time             CPU   Iterations
------------------------------------------------------------------------
BM_Addition<OldInt128>              1.45 ns         1.45 ns    484711423
BM_Subtraction<OldInt128>           1.45 ns         1.45 ns    475188736
BM_Multiplication<OldInt128>        1.47 ns         1.47 ns    483199322
BM_Division<OldInt128>              1.49 ns         1.49 ns    488830649
BM_Addition<OldUInt128>             1.45 ns         1.45 ns    487019006
BM_Subtraction<OldUInt128>          1.45 ns         1.45 ns    477626299
BM_Multiplication<OldUInt128>       1.47 ns         1.47 ns    475294481
BM_Division<OldUInt128>             1.48 ns         1.48 ns    461236815
BM_Addition<OldInt256>              4.39 ns         4.39 ns    159221253
BM_Subtraction<OldInt256>           5.01 ns         5.01 ns    100000000
BM_Multiplication<OldInt256>        11.3 ns         11.3 ns     54204439
BM_Division<OldInt256>              48.5 ns         48.5 ns     18505649
BM_Addition<OldUInt256>             4.37 ns         4.37 ns    180812154
BM_Subtraction<OldUInt256>          5.41 ns         5.41 ns    133516077
BM_Multiplication<OldUInt256>       2.47 ns         2.47 ns    286377591
BM_Division<OldUInt256>             21.8 ns         21.8 ns     25876643
*/