@@ -2521,7 +2521,7 @@ tf_kernel_library(
tf_kernel_library(
name = "lookup_table_op",
prefix = "lookup_table_op",
- deps = LOOKUP_DEPS,
+    deps = LOOKUP_DEPS + ["//third_party/rec_base:rec_base_lib"],
)
cc_library(
@@ -3751,7 +3751,7 @@ tf_kernel_library(
tf_kernel_library(
name = "cwise_op",
prefix = "cwise_op",
- deps = MATH_DEPS,
+    deps = MATH_DEPS + ["//third_party/rec_base:rec_base_lib"],
)
tf_kernel_library(
@@ -3795,7 +3795,7 @@ tf_kernel_library(
"//conditions:default": [],
}) + mkl_deps() + if_cuda([
"//tensorflow/core/platform/default/build_config:cublas_plugin",
- ]),
+ ]) + ["//third_party/rec_base:rec_base_lib"],
)
tf_mkl_kernel_library(
@@ -14,6 +14,7 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
+#include "third_party/rec_base/math/BinaryOp.h"
namespace tensorflow {
REGISTER2(BinaryOp, CPU, "FloorMod", functor::safe_floor_mod, int32, int64);
@@ -14,6 +14,7 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
+#include "third_party/rec_base/math/BinaryOp.h"
namespace tensorflow {
REGISTER9(BinaryOp, CPU, "Greater", functor::greater, float, Eigen::half,
@@ -14,6 +14,7 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
+#include "third_party/rec_base/math/BinaryOp.h"
namespace tensorflow {
REGISTER5(BinaryOp, CPU, "Less", functor::less, float, Eigen::half, double,
@@ -23,6 +23,8 @@ limitations under the License.
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/cwise_ops_common.h"
#include "tensorflow/core/platform/prefetch.h"
+#include "third_party/rec_base/include/select.h"
+#include "tensorflow/core/util/work_sharder.h"
namespace tensorflow {
@@ -143,6 +145,172 @@ class SelectOp : public OpKernel {
private:
TF_DISALLOW_COPY_AND_ASSIGN(SelectOp);
};
+
+// Partial specialization of SelectOp for `long long` (64-bit integer) tensors.
+//
+// Same-shape, element-wise selection on the CPU device is handed to the
+// rec_base routine ock::Select and split across the device's CPU worker threads
+// with Shard(). Scalar and vector-broadcast conditions, and every non-CPU
+// Device, go through the regular TensorFlow select functors.
+template <typename Device> class SelectOp<Device, long long> : public OpKernel {
+ public:
+  using T = int64_t;
+  explicit SelectOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  // Reads "condition", "t" and "e" and dispatches on the shape of the
+  // condition: scalar, vector broadcast over "t"/"e", or same-shape selection.
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor* cond;
+    const Tensor* then;
+    const Tensor* else_;
+    OP_REQUIRES_OK(ctx, ctx->input("condition", &cond));
+    OP_REQUIRES_OK(ctx, ctx->input("t", &then));
+    OP_REQUIRES_OK(ctx, ctx->input("e", &else_));
+
+    if (TensorShapeUtils::IsScalar(cond->shape())) {
+      ComputeScalar(ctx, cond, then, else_);
+      return;
+    }
+
+    const bool broadcasting = TensorShapeUtils::IsVector(cond->shape()) &&
+                              !TensorShapeUtils::IsVector(then->shape());
+    if (broadcasting) {
+      ComputeBroadcasting(ctx, cond, then, else_);
+      return;
+    }
+
+    // The fast path below passes raw tensor buffers to ock::Select from CPU
+    // worker threads, so it is restricted to the CPU device; every other
+    // Device keeps the generic functor implementation.
+    if (!RunsOnCpu(static_cast<const Device*>(nullptr))) {
+      ComputeElementwise(ctx, cond, then, else_);
+      return;
+    }
+
+    if (!ctx->ValidateInputsAreSameShape(this)) {
+      return;
+    }
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->forward_input_or_allocate_output(
+                            {"t", "e"}, "output", then->shape(), &output));
+    if (output->NumElements() <= 0) {
+      return;
+    }
+
+    // ock::Select is called with int64_t buffers while the tensors hold
+    // `long long`; the casts rename the element type and drop const to match
+    // the non-const pointer parameters declared in select.h.
+    T* then_data = const_cast<T*>(
+        reinterpret_cast<const T*>(then->flat<long long>().data()));
+    T* else_data = const_cast<T*>(
+        reinterpret_cast<const T*>(else_->flat<long long>().data()));
+    T* out_data = reinterpret_cast<T*>(output->flat<long long>().data());
+    bool* cond_data = const_cast<bool*>(cond->flat<bool>().data());
+
+    // NOTE(review): the int returned by ock::Select is ignored here; select.h
+    // does not say what it means -- confirm whether it is an error status.
+    auto work = [&](int64_t start, int64_t end) {
+      ock::Select(cond_data + start, then_data + start, else_data + start,
+                  out_data + start, end - start);
+    };
+    auto* worker_threads = ctx->device()->tensorflow_cpu_worker_threads();
+    // 10 is the cost-per-element estimate passed to Shard().
+    Shard(worker_threads->num_threads, worker_threads->workers,
+          cond->flat<bool>().size(), 10, work);
+  }
+
+ protected:
+  // Selects whole rows: `cond` is a vector with one entry per outermost index
+  // of `then`/`else_`. Reports InvalidArgument through `ctx` when the shapes
+  // do not fit together.
+  void ComputeBroadcasting(OpKernelContext* ctx, const Tensor* cond,
+                           const Tensor* then, const Tensor* else_) {
+    // Preliminary validation of sizes.
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsVector(cond->shape()),
+        errors::InvalidArgument("'cond' must be a vector, but saw shape: ",
+                                cond->shape().DebugString()));
+    OP_REQUIRES(
+        ctx,
+        FastBoundsCheck(cond->NumElements(),
+                        std::numeric_limits<Eigen::DenseIndex>::max()),
+        errors::InvalidArgument("cond vector larger than ",
+                                std::numeric_limits<Eigen::DenseIndex>::max()));
+    OP_REQUIRES(
+        ctx,
+        FastBoundsCheck(then->flat_outer_dims<long long>().dimension(1),
+                        std::numeric_limits<Eigen::DenseIndex>::max()),
+        errors::InvalidArgument("flat outer dims dim 1 size >= ",
+                                std::numeric_limits<Eigen::DenseIndex>::max()));
+
+    OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(then->shape()),
+                errors::InvalidArgument(
+                    "'then' must be at least a vector, but saw shape: ",
+                    then->shape().DebugString()));
+    OP_REQUIRES(
+        ctx, then->shape().dim_size(0) == cond->NumElements(),
+        errors::InvalidArgument(
+            "Number of batches of 'then' must match size of 'cond', but saw: ",
+            then->shape().dim_size(0), " vs. ", cond->NumElements()));
+    OP_REQUIRES(
+        ctx, then->shape().IsSameSize(else_->shape()),
+        errors::InvalidArgument(
+            "'then' and 'else' must have the same size. but received: ",
+            then->shape().DebugString(), " vs. ",
+            else_->shape().DebugString()));
+
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->forward_input_or_allocate_output(
+                            {"t", "e"}, "output", then->shape(), &output));
+    if (output->NumElements() > 0) {
+      functor::BatchSelectFunctor<Device, long long> func;
+      func(ctx->eigen_device<Device>(), output->flat_outer_dims<long long>(),
+           cond->vec<bool>(), then->flat_outer_dims<long long>(),
+           else_->flat_outer_dims<long long>());
+    }
+  }
+
+  // Same-shape selection through the generic SelectFunctor; Compute() uses it
+  // for every Device that must not take the ock::Select fast path.
+  void ComputeElementwise(OpKernelContext* ctx, const Tensor* cond,
+                          const Tensor* then, const Tensor* else_) {
+    if (!ctx->ValidateInputsAreSameShape(this)) return;
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->forward_input_or_allocate_output(
+                            {"t", "e"}, "output", then->shape(), &output));
+    if (output->NumElements() > 0) {
+      functor::SelectFunctor<Device, long long> func;
+      func(ctx->eigen_device<Device>(), output->flat<long long>(), cond->flat<bool>(),
+           then->flat<long long>(), else_->flat<long long>());
+    }
+  }
+
+  // Scalar condition: after checking that `then` and `else_` have the same
+  // size, defers to SelectScalarHandler.
+  void ComputeScalar(OpKernelContext* ctx, const Tensor* cond,
+                     const Tensor* then, const Tensor* else_) {
+    OP_REQUIRES(
+        ctx, then->shape().IsSameSize(else_->shape()),
+        errors::InvalidArgument(
+            "'then' and 'else' must have the same size. but received: ",
+            then->shape().DebugString(), " vs. ",
+            else_->shape().DebugString()));
+
+    functor::SelectScalarHandler<Device, long long> handler;
+    handler(ctx, cond, then, else_);
+  }
+
+ private:
+  // Compile-time device test by overload resolution: the exact pointer match
+  // is chosen for the CPU device, the `const void*` fallback for any other.
+  static constexpr bool RunsOnCpu(const Eigen::ThreadPoolDevice*) {
+    return true;
+  }
+  static constexpr bool RunsOnCpu(const void*) { return false; }
+
+  TF_DISALLOW_COPY_AND_ASSIGN(SelectOp);
+};
+
template <typename Device, typename T>
class SelectV2Op : public OpKernel {
public:
@@ -36,6 +36,10 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/util/bcast.h"
+#include <chrono>
+#include <iostream>
+#include <string>
+
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
@@ -11,6 +11,7 @@ def tflite_copts():
"""Defines compile time flags."""
copts = [
"-DFARMHASH_NO_CXX_STRING",
+ "-flax-vector-conversions",
] + select({
str(Label("//tensorflow:android_arm64")): [
"-O3",
@@ -490,7 +490,7 @@ bool RegisterBfloat16Cast(int numpy_type, bool cast_is_safe) {
}
template <typename InType, typename OutType, typename Functor>
-void BinaryUFunc(char** args, npy_intp* dimensions, npy_intp* steps,
+void BinaryUFunc(char** args, npy_intp const* dimensions, npy_intp const* steps,
void* data) {
const char* i0 = args[0];
const char* i1 = args[1];
@@ -506,7 +506,7 @@ void BinaryUFunc(char** args, npy_intp* dimensions, npy_intp* steps,
}
template <typename Functor>
-void CompareUFunc(char** args, npy_intp* dimensions, npy_intp* steps,
+void CompareUFunc(char** args, npy_intp const* dimensions, npy_intp const* steps,
void* data) {
BinaryUFunc<bfloat16, npy_bool, Functor>(args, dimensions, steps, data);
}
new file mode 100644
@@ -0,0 +1,18 @@
+licenses(["notice"])
+
+# Prebuilt rec_base shared library together with the headers that declare its
+# entry points and the TensorFlow kernel glue under math/.
+cc_library(
+    name = "rec_base_lib",
+    srcs = ["lib/lib_rec_base.so"],
+    hdrs = [
+        "include/cmp.h",
+        "include/floor_mod.h",
+        "include/select.h",
+        "include/set_external_logger.h",
+        "math/BinaryOp.h",
+        "math/tools.h",
+    ],
+    includes = ["."],
+    visibility = ["//visibility:public"],
+)
new file mode 100644
@@ -0,0 +1,27 @@
+#ifndef RECBASE_CMP_H
+#define RECBASE_CMP_H
+
+#include <cstdlib>
+
+namespace ock {
+// Less, array vs. array. Presumably output[i] = input0[i] < input1[i] for i in [0, length) -- TODO confirm; the int return value is not documented here.
+template <typename T> int Less(T *input0, T *input1, bool *output, size_t length);
+
+// Less, scalar right operand: input0 is an array, input1 a single value.
+template <typename T> int Less(T *input0, T input1, bool *output, size_t length);
+
+// Less, scalar left operand: input0 is a single value, input1 an array.
+template <typename T> int Less(T input0, T *input1, bool *output, size_t length);
+
+// Greater, array vs. array. Presumably output[i] = input0[i] > input1[i] for i in [0, length) -- TODO confirm; the int return value is not documented here.
+template <typename T> int Greater(T *input0, T *input1, bool *output, size_t length);
+
+// Greater, scalar right operand: input0 is an array, input1 a single value.
+template <typename T> int Greater(T *input0, T input1, bool *output, size_t length);
+
+// Greater, scalar left operand: input0 is a single value, input1 an array.
+template <typename T> int Greater(T input0, T *input1, bool *output, size_t length);
+} // namespace ock
+
+#endif // RECBASE_CMP_H
+
new file mode 100644
@@ -0,0 +1,22 @@
+#ifndef RECBASE_FLOOR_MOD_H
+#define RECBASE_FLOOR_MOD_H
+
+#include <cstddef>  // size_t; this header must not rely on its includers for it
+
+namespace ock {
+// Floor-modulo routines of the prebuilt rec_base library, writing `length`
+// results to `output`. Presumably output[i] is the floor-mod of the i-th
+// operand pair -- TODO confirm against the library documentation; the meaning
+// of the int return value is not visible from this header.
+
+// Array input, array modulus.
+template <typename T> int FloorMod(T *input, T *mod, T *output, size_t length);
+
+// Scalar input, array modulus.
+template <typename T> int FloorMod(T input, T *mod, T *output, size_t length);
+
+// Array input, scalar modulus.
+template <typename T> int FloorMod(T *input, T mod, T *output, size_t length);
+}  // namespace ock
+
+#endif // RECBASE_FLOOR_MOD_H
new file mode 100644
@@ -0,0 +1,13 @@
+#ifndef RECBASE_SELECT_H
+#define RECBASE_SELECT_H
+
+#include <cstdlib>
+
+namespace ock {
+// Element-wise select over `length` entries. Presumably output[i] receives
+// thenBranch[i] where cond[i] is true and elseBranch[i] otherwise -- TODO
+// confirm; the meaning of the int return value is not visible from this header.
+template <typename T> int Select(bool *cond, T *thenBranch, T *elseBranch, T *output, size_t length);
+} // namespace ock
+
+#endif // RECBASE_SELECT_H
new file mode 100644
@@ -0,0 +1,22 @@
+#ifndef RECBASE_SET_EXTERNAL_LOGGER_H
+#define RECBASE_SET_EXTERNAL_LOGGER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Logging callback type: receives an integer level and a C-string message.
+using ExternalLog = void (*) (int level, const char *msg);
+
+#ifdef __cplusplus
+}
+#endif
+
+namespace ock
+{
+// Takes a logging callback; presumably the library then reports its messages
+// through `logFunc` -- TODO confirm, along with the meaning of the int result.
+int SetExternalLogFunc(ExternalLog logFunc);
+} // namespace ock
+
+#endif // RECBASE_SET_EXTERNAL_LOGGER_H
new file mode 100644
@@ -0,0 +1,167 @@
+#ifndef TF_BINARYOP_H
+#define TF_BINARYOP_H
+
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+#include "tensorflow/core/util/work_sharder.h"
+#include "tools.h"
+#include "third_party/rec_base/include/floor_mod.h"
+#include "third_party/rec_base/include/cmp.h"
+#include <chrono>
+#include <iostream>
+#include <cstdlib>
+#include <limits>
+#include <string>
+#include <stdexcept>
+// NOTE(review): a using-directive at header scope leaks every ock name into
+// each including file; it is kept so that existing includers still compile.
+using namespace ock;
+
+namespace tensorflow
+{
+// Cost-per-element estimate handed to Shard() by the kernels defined below.
+// Defaults to 8 and can be overridden with the PER_COST environment variable,
+// which is looked up on every call. Text that does not start with a number,
+// or a number that does not fit in an int, yields the default instead of an
+// exception.
+inline int RecBaseShardCost() {
+  const int kDefaultCost = 8;
+  const char* env_var = std::getenv("PER_COST");
+  if (env_var == nullptr) {
+    return kDefaultCost;
+  }
+  char* parse_end = nullptr;
+  const long parsed = std::strtol(env_var, &parse_end, 10);
+  const bool no_digits = (parse_end == env_var);
+  if (no_digits || parsed < std::numeric_limits<int>::min() ||
+      parsed > std::numeric_limits<int>::max()) {
+    return kDefaultCost;
+  }
+  return static_cast<int>(parsed);
+}
+
+// Defines the explicit specialization BinaryOp<CPUDevice, tf_func<scalar_type>>.
+//   tf_func       TensorFlow functor template, e.g. functor::less.
+//   scalar_type   element type the functor is instantiated with.
+//   ock_in_type   element type the input buffers are reinterpreted as.
+//   ock_out_type  element type the output buffer is reinterpreted as.
+//   func          ock routine called for the flat (ndims <= 1) case.
+// Flat inputs, including a single-element operand on either side, are split
+// into ranges by Shard() and each range is passed to func. Broadcasts of rank
+// 2 to 5 use functor::BinaryFunctor; higher ranks call SetUnimplementedError.
+// Only block comments may be used inside the macro body: a line comment would
+// swallow the continuation lines that follow it.
+#define REGISTER_BINARY_FUNC(tf_func, scalar_type, ock_in_type, ock_out_type, func) \
+  template <> class BinaryOp<CPUDevice, tf_func<scalar_type>> : public BinaryOpShared { \
+   public: \
+    typedef CPUDevice Device; \
+    typedef typename tf_func<scalar_type> Functor; \
+    typedef typename Functor::in_type Tin; \
+    typedef typename Functor::out_type Tout; \
+    explicit BinaryOp(OpKernelConstruction *ctx) \
+        : BinaryOpShared(ctx, DataTypeToEnum<Tout>::v(), DataTypeToEnum<Tin>::v()) {} \
+    void Compute(OpKernelContext* ctx) override { \
+      BinaryOpState state(ctx); \
+      auto& bcast = state.bcast; \
+      const Device& eigen_device = ctx->eigen_device<Device>(); \
+      auto &in0 = state.in0; \
+      auto &in1 = state.in1; \
+      Tensor* out = state.out; \
+      if (!bcast.IsValid()) { \
+        if (ctx->status().ok()) { \
+          if (state.result) { \
+            functor::SetOneFunctor<Device, bool>()(eigen_device, \
+                                                   out->flat<bool>()); \
+          } else { \
+            functor::SetZeroFunctor<Device, bool>()(eigen_device, \
+                                                    out->flat<bool>()); \
+          } \
+        } \
+        return; \
+      } \
+      if (state.out_num_elements == 0) { \
+        return; \
+      } \
+      auto input0 = in0.flat<Tin>(); \
+      auto input1 = in1.flat<Tin>(); \
+      auto output_flat = out->flat<Tout>(); \
+      ock_in_type *input0_data = \
+          const_cast<ock_in_type *> (reinterpret_cast<const ock_in_type *>(input0.data())); \
+      ock_in_type *input1_data = \
+          const_cast<ock_in_type *> (reinterpret_cast<const ock_in_type *>(input1.data())); \
+      ock_out_type *output_data = \
+          const_cast<ock_out_type *> (reinterpret_cast<const ock_out_type *>(output_flat.data())); \
+      const int ndims = state.ndims; \
+      bool error = false; \
+      bool* const error_ptr = Functor::has_errors ? &error : nullptr; \
+      /* 64-bit count: a tensor may hold more elements than an int can index. */ \
+      const int64 size = output_flat.size(); \
+      const int per_cost = RecBaseShardCost(); \
+      if (ndims <= 1) { \
+        auto *worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); \
+        if (state.in1_num_elements == 1 && state.in0_num_elements != 1) { \
+          /* Right operand is a single value paired with every left element. */ \
+          auto work = [&](int64_t start, int64_t end) { \
+            func(input0_data + start, input1_data[0], output_data + start, end - start); \
+          }; \
+          Shard(worker_threads->num_threads, worker_threads->workers, size, \
+                per_cost, work); \
+        } else if (state.in0_num_elements == 1 && state.in1_num_elements != 1) { \
+          /* Left operand is a single value paired with every right element. */ \
+          auto work = [&](int64_t start, int64_t end) { \
+            func(input0_data[0], input1_data + start, output_data + start, end - start); \
+          }; \
+          Shard(worker_threads->num_threads, worker_threads->workers, size, \
+                per_cost, work); \
+        } else { \
+          /* Both operands are walked element by element over the same range. */ \
+          auto work = [&](int64_t start, int64_t end) { \
+            func(input0_data + start, input1_data + start, output_data + start, end - start); \
+          }; \
+          Shard(worker_threads->num_threads, worker_threads->workers, size, \
+                per_cost, work); \
+        } \
+      } else if (ndims == 2) { \
+        functor::BinaryFunctor<Device, Functor, 2>().BCast( \
+            eigen_device, out->shaped<Tout, 2>(bcast.result_shape()), \
+            in0.template shaped<Tin, 2>(bcast.x_reshape()), \
+            BCast::ToIndexArray<2>(bcast.x_bcast()), \
+            in1.template shaped<Tin, 2>(bcast.y_reshape()), \
+            BCast::ToIndexArray<2>(bcast.y_bcast()), error_ptr); \
+      } else if (ndims == 3) { \
+        functor::BinaryFunctor<Device, Functor, 3>().BCast( \
+            eigen_device, out->shaped<Tout, 3>(bcast.result_shape()), \
+            in0.template shaped<Tin, 3>(bcast.x_reshape()), \
+            BCast::ToIndexArray<3>(bcast.x_bcast()), \
+            in1.template shaped<Tin, 3>(bcast.y_reshape()), \
+            BCast::ToIndexArray<3>(bcast.y_bcast()), error_ptr); \
+      } else if (ndims == 4) { \
+        functor::BinaryFunctor<Device, Functor, 4>().BCast( \
+            eigen_device, out->shaped<Tout, 4>(bcast.result_shape()), \
+            in0.template shaped<Tin, 4>(bcast.x_reshape()), \
+            BCast::ToIndexArray<4>(bcast.x_bcast()), \
+            in1.template shaped<Tin, 4>(bcast.y_reshape()), \
+            BCast::ToIndexArray<4>(bcast.y_bcast()), error_ptr); \
+      } else if (ndims == 5) { \
+        functor::BinaryFunctor<Device, Functor, 5>().BCast( \
+            eigen_device, out->shaped<Tout, 5>(bcast.result_shape()), \
+            in0.template shaped<Tin, 5>(bcast.x_reshape()), \
+            BCast::ToIndexArray<5>(bcast.x_bcast()), \
+            in1.template shaped<Tin, 5>(bcast.y_reshape()), \
+            BCast::ToIndexArray<5>(bcast.y_bcast()), error_ptr); \
+      } else { \
+        SetUnimplementedError(ctx); \
+      } \
+      if (Functor::has_errors && error) { \
+        SetComputeError(ctx); \
+      } \
+    } \
+  };
+
+REGISTER_BINARY_FUNC(functor::floor_fmod, float, float, float, FloorMod);
+REGISTER_BINARY_FUNC(functor::floor_fmod, double, double, double, FloorMod);
+REGISTER_BINARY_FUNC(functor::less, int64, int64_t, bool, Less);
+REGISTER_BINARY_FUNC(functor::less, int32, int32_t, bool, Less);
+REGISTER_BINARY_FUNC(functor::greater, int64, int64_t, bool, Greater);
+REGISTER_BINARY_FUNC(functor::greater, int32, int32_t, bool, Greater);
+} // namespace tensorflow
+#endif // TF_BINARYOP_H
new file mode 100644
@@ -0,0 +1,25 @@
+#ifndef KUNPENG_TOOLS_H
+#define KUNPENG_TOOLS_H
+
+#include "third_party/eigen3/Eigen/Core"
+
+// The specialization below is written with AArch64 NEON intrinsics, so it is
+// compiled only for that architecture.
+#if defined(__aarch64__)
+namespace Eigen {
+namespace internal {
+// Rounds both lanes of `a` toward negative infinity.
+//
+// vrndmq_f64 rounds in the floating-point domain. A round trip through
+// vcvtq_s64_f64 would instead saturate for magnitudes of 2^63 and above and
+// turn NaN into 0, giving wrong results for those inputs.
+template <>
+EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d &a)
+{
+  return vrndmq_f64(a);
+}
+} // namespace internal
+} // namespace Eigen
+#endif // defined(__aarch64__)
+
+#endif // KUNPENG_TOOLS_H
--