From 1767fee69881ab53a071d5e8fd07cafc00a3f7a0 Mon Sep 17 00:00:00 2001
From: wejoncy
Date: Tue, 10 Sep 2024 17:03:22 +0800
Subject: [PATCH 1/6] support coreml fp16

---
 .../coreml/builders/impl/base_op_builder.cc   | 24 +++++++++++++++----
 .../coreml/builders/impl/base_op_builder.h    |  2 +-
 .../coreml/builders/impl/binary_op_builder.cc |  4 ++--
 .../coreml/builders/impl/builder_utils.cc     | 16 +++++++++++++
 .../coreml/builders/impl/builder_utils.h      |  3 +++
 .../coreml/builders/model_builder.cc          |  3 +++
 .../core/providers/coreml/model/model.mm      | 16 +++++++++++++
 7 files changed, 60 insertions(+), 8 deletions(-)

diff --git a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc
index 2cae85a0a1c8..9de6e2c20c97 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc
@@ -1,6 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.

+#include <set>
 #include "core/providers/common.h"
 #include "core/providers/coreml/builders/helper.h"
 #include "core/providers/coreml/builders/impl/base_op_builder.h"
@@ -12,6 +13,10 @@ using namespace CoreML::Specification;
 namespace onnxruntime {
 namespace coreml {

+static std::set<std::string> Float16Ops = {
+    "Add",
+};
+
 namespace {
 // TODO, move this to shared_library
 bool HasExternalInitializer(const InitializedTensorSet& initializers, const Node& node,
@@ -83,7 +88,7 @@ bool BaseOpBuilder::HasSupportedInputs(const Node& node, const OpBuilderInputPar
 }

 /* static */
-bool BaseOpBuilder::IsInputFloat(const Node& node, size_t idx, const OpBuilderInputParams& /*input_params*/,
+bool BaseOpBuilder::IsInputDtypeSupport(const Node& node, size_t idx, const OpBuilderInputParams& /*input_params*/,
                                  const logging::Logger& logger) {
   if (idx >= node.InputDefs().size()) {
     LOGS(logger, VERBOSE) << "Input index [" << idx << "] is out of range";
@@ -94,12 +99,21 @@ bool BaseOpBuilder::IsInputFloat(const Node& node, size_t idx, const OpBuilderIn

   int32_t input_type = ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED;

-  // currently only float is supported
-  if (!GetType(input, input_type, logger) || input_type != ONNX_NAMESPACE::TensorProto_DataType_FLOAT) {
-    LOGS(logger, VERBOSE) << "[" << node.OpType() << "] Input type: [" << input_type << "] is not currently supported";
+  if (!GetType(input, input_type, logger)) {
+    LOGS(logger, VERBOSE) << "[" << node.OpType() << "] Get Input type failed";
     return false;
   }

+  // float is supported
+  if (input_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT){
+    return true;
+  }
+
+  if (input_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 && Float16Ops.count(node.OpType())) {
+    return true;
+  }
+
+  LOGS(logger, VERBOSE) << "[" << node.OpType() << "] Input type: [" << input_type << "] is not currently supported";
   return true;
 }

@@ -107,7 +121,7 @@ bool BaseOpBuilder::HasSupportedInputsImpl(const Node& node, const OpBuilderInpu
                                            const logging::Logger& logger) const {
   // We only check the type of input 0 by default
   // specific op builder can override this
-  return IsInputFloat(node, 0, input_params, logger);
+  return IsInputDtypeSupport(node, 0, input_params, logger);
 }

 bool BaseOpBuilder::HasSupportedOpSet(const Node& node, const logging::Logger& logger) const {
diff --git a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h
index 071008520fbd..6bd3c43f373c 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h
+++ b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h
@@ -33,7 +33,7 @@ class BaseOpBuilder : public IOpBuilder {
   }

   // currently we only support float
-  static bool IsInputFloat(const Node& node, size_t idx, const OpBuilderInputParams& input_params,
+  static bool IsInputDtypeSupport(const Node& node, size_t idx, const OpBuilderInputParams& input_params,
                            const logging::Logger& logger);

 private:
diff --git a/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc
index fb8e07633621..3ecea9c3770f 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc
@@ -139,8 +139,8 @@ bool BinaryOpBuilder::HasSupportedInputsImpl(const Node& node, const OpBuilderIn
   // Add/Sub/Mul/Div spec says inputs must be of the same type.
   // Pow spec says inputs can be different types.
   // We only support float for all of these inputs.
-  if (!IsInputFloat(node, 0, input_params, logger) ||
-      ((node.OpType() == "Pow") && !IsInputFloat(node, 1, input_params, logger))) {
+  if (!IsInputDtypeSupport(node, 0, input_params, logger) ||
+      ((node.OpType() == "Pow") && !IsInputDtypeSupport(node, 1, input_params, logger))) {
     return false;
   }

diff --git a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc
index e02186d3aee8..328f8b327992 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc
@@ -96,6 +96,9 @@ Status CreateCoreMLWeight(CoreML::Specification::WeightParams& weight,
     case ONNX_NAMESPACE::TensorProto_DataType_FLOAT:
       CreateCoreMLWeight(weight, unpacked_tensor.DataAsSpan<float>());
       break;
+    case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16:
+      CreateCoreMLWeight(weight, unpacked_tensor.DataAsSpan<MLFloat16>());
+      break;
     case ONNX_NAMESPACE::TensorProto_DataType_INT32:
       CreateCoreMLWeight(weight, unpacked_tensor.DataAsSpan<int32_t>());
       break;
@@ -114,6 +117,11 @@ void CreateCoreMLWeight(CoreML::Specification::WeightParams& weight, gsl::span<const float> data) {
   weight.mutable_floatvalue()->Assign(data.begin(), data.end());
 }

+void CreateCoreMLWeight(CoreML::Specification::WeightParams& weight, gsl::span<const MLFloat16> data) {
+  const char* data_byte_ptr = (const char*)(data.data());
+  weight.mutable_float16value()->assign(data_byte_ptr, data_byte_ptr+data.size_bytes());
+}
+
 namespace {
 template <typename T>
 void CreateCoreMLWeightConvertingDataToFloats(CoreML::Specification::WeightParams& weight, gsl::span<const T> data) {
@@ -133,6 +141,8 @@ void CreateCoreMLWeight(CoreML::Specification::WeightParams& weight, gsl::span<const int64_t>
 template MILSpec::Value CreateTensorValue<float, float>(gsl::span<const float> data,
                                                         std::optional<gsl::span<const int64_t>> shape);
+template MILSpec::Value CreateTensorValue(gsl::span data,
+                                        std::optional<gsl::span<const int64_t>> shape);
+template MILSpec::Value CreateTensorValue(gsl::span data,
+                                        std::optional<gsl::span<const int64_t>> shape);
+template MILSpec::Value CreateTensorValue(gsl::span data,
+                                        std::optional<gsl::span<const int64_t>> shape);

 template MILSpec::Value CreateScalarTensorValue<float>(const float& data);
 template MILSpec::Value CreateScalarTensorValue<int32_t>(const int32_t& data);

diff --git a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h
index 475ce79b0a81..f25936e25a17 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h
+++ b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h
@@ -41,6 +41,9 @@ Status CreateCoreMLWeight(CoreML::Specification::WeightParams& weight, const ONNX_NAMESPACE::TensorProto& tensor);
 // Copy the float array to a coreml weight
 void CreateCoreMLWeight(CoreML::Specification::WeightParams& weight, gsl::span<const float> data);

+// Copy the float16 array to a coreml weight
+void CreateCoreMLWeight(CoreML::Specification::WeightParams& weight, gsl::span<const MLFloat16> data);
+
 // Copy the int32_t array to a coreml weight
 void CreateCoreMLWeight(CoreML::Specification::WeightParams& weight, gsl::span<const int32_t> data);

diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.cc b/onnxruntime/core/providers/coreml/builders/model_builder.cc
index 9668bfcd09ad..7ecfad8493ea 100644
--- a/onnxruntime/core/providers/coreml/builders/model_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/model_builder.cc
@@ -811,6 +811,9 @@ Status ModelBuilder::RegisterModelInputOutput(const NodeArg& node_arg, bool is_i
     case ONNX_NAMESPACE::TensorProto_DataType_FLOAT:
       multi_array->set_datatype(ArrayFeatureType::FLOAT32);
       break;
+    case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16:
+      multi_array->set_datatype(ArrayFeatureType::FLOAT16);
+      break;
     case ONNX_NAMESPACE::TensorProto_DataType_INT32:
       multi_array->set_datatype(ArrayFeatureType::INT32);
       break;
diff --git a/onnxruntime/core/providers/coreml/model/model.mm b/onnxruntime/core/providers/coreml/model/model.mm
index 68460ff7c9b3..60c93aa60162 100644
--- a/onnxruntime/core/providers/coreml/model/model.mm
+++ b/onnxruntime/core/providers/coreml/model/model.mm
@@ -120,6 +120,10 @@ Status CreateInputFeatureProvider(const std::unordered_map<std::string, OnnxTensorData>& inputs,
+    case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16: {
+      const auto* src_buffer = static_cast<const uint16_t*>(mlmultiarray_buffer);
+      auto* dst_buffer = static_cast<uint16_t*>(tensor_buffer);
+      const auto block_byte_size = block_size * sizeof(uint16_t);
+
+      for (int64_t idx = 0; idx < num_blocks; ++idx) {
+        memcpy(dst_buffer, src_buffer, block_byte_size);
+        src_buffer += stride;
+        dst_buffer += block_size;
+      }
+      break;
+    }
     case ONNX_NAMESPACE::TensorProto_DataType_INT32: {
       const auto* src_buffer = static_cast<const int32_t*>(mlmultiarray_buffer);
       auto* dst_buffer = static_cast<int32_t*>(tensor_buffer);

From bb9900882c08a16a16cb27e2555fd1b3c29bebbf Mon Sep 17 00:00:00 2001
From: wejoncy
Date: Wed, 11 Sep 2024 03:49:00 -0700
Subject: [PATCH 2/6] support unary and binary ops

---
 .../coreml/builders/impl/base_op_builder.cc   | 11 +--
 .../coreml/builders/impl/binary_op_builder.cc |  4 +-
 .../coreml/builders/impl/unary_op_builder.cc  | 32 +++++++
 .../providers/coreml/coreml_basic_test.cc     | 87 +++++++++++++++++++
 onnxruntime/test/util/test_utils.cc           |  5 ++
 5 files changed, 132 insertions(+), 7 deletions(-)

diff --git a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc
index 9de6e2c20c97..cc6f2d796c5e 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc
@@ -14,7 +14,7 @@ namespace onnxruntime {
 namespace coreml {

 static std::set<std::string> Float16Ops = {
-    "Add",
+    "Add", "Mul", "Sub", "Div", "Pow", "Sqrt", "Reciprocal"
 };

 namespace {
@@ -88,7 +88,7 @@ bool BaseOpBuilder::HasSupportedInputs(const Node& node, const OpBuilderInputPar
 }

 /* static */
-bool BaseOpBuilder::IsInputDtypeSupport(const Node& node, size_t idx, const OpBuilderInputParams& /*input_params*/,
+bool BaseOpBuilder::IsInputDtypeSupport(const Node& node, size_t idx, const OpBuilderInputParams& input_params,
                                  const logging::Logger& logger) {
   if (idx >= node.InputDefs().size()) {
     LOGS(logger, VERBOSE) << "Input index [" << idx << "] is out of range";
@@ -109,12 +109,13 @@ bool BaseOpBuilder::IsInputDtypeSupport(const Node& node, size_t idx, const OpBuilderInputParams&
     return true;
   }

-  if (input_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 && Float16Ops.count(node.OpType())) {
+#if defined(COREML_ENABLE_MLPROGRAM)
+  if (input_params.create_mlprogram && input_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 && Float16Ops.count(node.OpType())) {
     return true;
   }
-
+#endif
   LOGS(logger, VERBOSE) << "[" << node.OpType() << "] Input type: [" << input_type << "] is not currently supported";
-  return true;
+  return false;
 }

 bool BaseOpBuilder::HasSupportedInputsImpl(const Node& node, const OpBuilderInputParams& input_params,
diff --git a/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc
index 3ecea9c3770f..bc1eed8c1920 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc
@@ -73,7 +73,7 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
   } else if (op_type == "Sub") {
     coreml_op_type = "sub";
   } else if (op_type == "Div") {
-    // we only support fp32 currently. when we add support for integers we need to check the type and use
+    // we support fp32/fp16 currently. when we add support for integers we need to check the type and use
     // "floor_div" or "real_div" accordingly
     coreml_op_type = "real_div";
   } else if (op_type == "Pow") {
@@ -138,7 +138,7 @@ bool BinaryOpBuilder::HasSupportedInputsImpl(const Node& node, const OpBuilderIn
                                              const logging::Logger& logger) const {
   // Add/Sub/Mul/Div spec says inputs must be of the same type.
   // Pow spec says inputs can be different types.
-  // We only support float for all of these inputs.
+  // We support float/float16 for all of these inputs.
   if (!IsInputDtypeSupport(node, 0, input_params, logger) ||
       ((node.OpType() == "Pow") && !IsInputDtypeSupport(node, 1, input_params, logger))) {
     return false;
   }

diff --git a/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc
index 3403378d5911..595e08d1d771 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc
@@ -3,6 +3,7 @@

 #include "core/providers/common.h"

+#include "core/providers/coreml/builders/impl/builder_utils.h"
 #include "core/providers/coreml/builders/helper.h"
 #include "core/providers/coreml/builders/impl/base_op_builder.h"
 #include "core/providers/coreml/builders/model_builder.h"
@@ -14,6 +15,7 @@ namespace coreml {
 class UnaryOpBuilder : public BaseOpBuilder {
   Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
                                const logging::Logger& logger) const override;
+  bool SupportsMLProgram() const override { return true; }
 };

 Status UnaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
@@ -21,6 +23,35 @@ Status UnaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
   const auto& op_type(node.OpType());
   const auto& input_defs(node.InputDefs());

+
+#if defined(COREML_ENABLE_MLPROGRAM)
+  if (model_builder.CreateMLProgram()) {
+    using namespace CoreML::Specification::MILSpec;
+
+    // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#module-coremltools.converters.mil.mil.ops.defs.iOS15.elementwise_binary
+    std::string_view coreml_op_type;
+    if (op_type == "Sqrt") {
+      coreml_op_type = "sqrt";
+    } else if (op_type == "Reciprocal") {
+      coreml_op_type = "inverse";
+    } else {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                             "UnaryOpBuilder::AddToModelBuilderImpl, unexpected op: ", op_type);
+    }
+
+    std::unique_ptr<Operation> op = model_builder.CreateOperation(node, coreml_op_type);
+    AddOperationInput(*op, "x", input_defs[0]->Name());
+    if (op_type == "Reciprocal") {
+      float epsilon = 1e-4;  //epsilon: const T (Optional, default=1e-4)
+      AddOperationInput(*op, "epsilon", model_builder.AddScalarConstant(op->type(), "epsilon", epsilon));
+    }
+
+    AddOperationOutput(*op, *node.OutputDefs()[0]);
+
+    model_builder.AddOperation(std::move(op));
+  } else
+#endif  // defined (COREML_ENABLE_MLPROGRAM)
+  {
   std::unique_ptr<COREML_SPEC::NeuralNetworkLayer> layer = model_builder.CreateNNLayer(node);

   if (op_type == "Sqrt") {
     layer->mutable_unary()->set_type(COREML_SPEC::UnaryFunctionLayerParams::SQRT);
   } else if (op_type == "Reciprocal") {
     layer->mutable_unary()->set_type(COREML_SPEC::UnaryFunctionLayerParams::INVERSE);
   } else {
     return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
                            "UnaryOpBuilder::AddToModelBuilderImpl, unknown op: ", op_type);
   }
@@ -36,6 +67,7 @@ Status UnaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
   *layer->mutable_input()->Add() = input_defs[0]->Name();
   *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name();

   model_builder.AddLayer(std::move(layer));
+  }
   return Status::OK();
 }
diff --git a/onnxruntime/test/providers/coreml/coreml_basic_test.cc b/onnxruntime/test/providers/coreml/coreml_basic_test.cc
index daa24db13411..c9d8a605678b 100644
--- a/onnxruntime/test/providers/coreml/coreml_basic_test.cc
+++ b/onnxruntime/test/providers/coreml/coreml_basic_test.cc
@@ -257,6 +257,93 @@ TEST(CoreMLExecutionProviderTest, TestNameSanitization) {
   // TensorRT does not support Clip opset 11 yet.
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
 }
+
+TEST(CoreMLExecutionProviderTest, TestBinaryFp16) {
+  auto test_binary_op = [](std::string op){
+    OpTester test(op, 11);
+
+    std::vector<int64_t> dims{3, 3};
+    std::vector<float> input1 = {-1.0f, 0.0f, 1.0f,
+                          -6.0f, 0.0f, 6.0f,
+                          -5.4f, 2.0f, 6.0f};
+    std::vector<MLFloat16> input1_fp16(9);
+    ConvertFloatToMLFloat16(input1.data(), input1_fp16.data(), 9);
+    std::vector<float> input2 = {-1.0f, 0.0f, 1.0f,
+                          -5.0f, 0.0f, 5.0f,
+                          -5.0f, 2.0f, 5.0f};
+    std::vector<MLFloat16> input2_fp16(9);
+    ConvertFloatToMLFloat16(input2.data(), input2_fp16.data(), 9);
+    std::vector<float> output(9);
+    if (op == "Add"){
+      for(int i = 0; i < 9; i++){
+        output[i] = input1_fp16[i] + input2_fp16[i];
+      }
+    } else if (op == "Sub") {
+      for(int i = 0; i < 9; i++){
+        output[i] = input1_fp16[i] - input2_fp16[i];
+      }
+    } else if (op == "Mul") {
+      for(int i = 0; i < 9; i++){
+        output[i] = input1_fp16[i] * input2_fp16[i];
+      }
+    } else if (op == "Div") {
+      for(int i = 0; i < 9; i++){
+        output[i] = input1_fp16[i] / input2_fp16[i];
+      }
+    }
+    std::vector<MLFloat16> output_fp16(9);
+    ConvertFloatToMLFloat16(output.data(), output_fp16.data(), 9);
+
+    test.AddInput<MLFloat16>("0", dims, input1_fp16);
+    test.AddInput<MLFloat16>("1.min", dims, input2_fp16);
+    test.AddOutput<MLFloat16>("3", dims, output_fp16);
+
+    // TensorRT does not support Clip opset 11 yet.
+    std::vector<std::unique_ptr<IExecutionProvider>> coreml_ep;
+    coreml_ep.emplace_back(MakeCoreMLExecutionProvider(COREML_FLAG_CREATE_MLPROGRAM));
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &coreml_ep);
+  };
+  test_binary_op("Add");
+  test_binary_op("Sub");
+  test_binary_op("Div");
+  test_binary_op("Mul");
+}
+
+TEST(CoreMLExecutionProviderTest, TestUnaryFp16) {
+  auto test_binary_op = [](std::string op){
+    OpTester test(op, 11);
+
+    std::vector<int64_t> dims{3, 3};
+    std::vector<float> input1 = {-1.0f, 0.0f, 1.0f,
+                          -6.0f, 0.2f, 6.0f,
+                          -5.4f, 2.0f, 6.0f};
+    std::vector<MLFloat16> input1_fp16(9);
+    ConvertFloatToMLFloat16(input1.data(), input1_fp16.data(), 9);
+
+    std::vector<float> output(9);
+    if (op == "Sqrt"){
+      for(int i = 0; i < 9; i++){
+        output[i] = sqrt(input1_fp16[i]);
+      }
+    } else if (op == "Reciprocal") {
+      for(int i = 0; i < 9; i++){
+        output[i] = 1.0f/(1e-4+input1_fp16[i]);
+      }
+    }
+    std::vector<MLFloat16> output_fp16(9);
+    ConvertFloatToMLFloat16(output.data(), output_fp16.data(), 9);
+
+    test.AddInput<MLFloat16>("0", dims, input1_fp16);
+    test.AddOutput<MLFloat16>("3", dims, output_fp16);
+
+    // TensorRT does not support Clip opset 11 yet.
+    std::vector<std::unique_ptr<IExecutionProvider>> coreml_ep;
+    coreml_ep.emplace_back(MakeCoreMLExecutionProvider(COREML_FLAG_CREATE_MLPROGRAM));
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &coreml_ep);
+  };
+  test_binary_op("Sqrt");
+  test_binary_op("Reciprocal");
+}
 #endif

 }  // namespace test
diff --git a/onnxruntime/test/util/test_utils.cc b/onnxruntime/test/util/test_utils.cc
index 6bc0f8d10549..606b8d580fa3 100644
--- a/onnxruntime/test/util/test_utils.cc
+++ b/onnxruntime/test/util/test_utils.cc
@@ -55,6 +55,11 @@ void VerifyOutput(const std::string& output_name,
                   ::testing::Pointwise(::testing::FloatNear(fp32_abs_err), tensor.DataAsSpan<float>()));
       break;
     }
+    case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16: {
+      EXPECT_THAT(expected_tensor.DataAsSpan<MLFloat16>(),
+                  ::testing::Pointwise(::testing::FloatNear(fp32_abs_err), tensor.DataAsSpan<MLFloat16>()));
+      break;
+    }
     default:
       ORT_THROW("Unhandled data type. Please add 'case' statement for ", element_type);
   }

From 4e866d1dbab6dc41af3db414b79fc2523c20ca46 Mon Sep 17 00:00:00 2001
From: wejoncy
Date: Wed, 11 Sep 2024 03:57:27 -0700
Subject: [PATCH 3/6] format

---
 .../coreml/builders/impl/base_op_builder.cc   |  7 ++--
 .../coreml/builders/impl/base_op_builder.h    |  2 +-
 .../coreml/builders/impl/builder_utils.cc     | 10 +++---
 .../coreml/builders/impl/unary_op_builder.cc  | 27 +++++++--------
 .../providers/coreml/coreml_basic_test.cc     | 34 +++++++++----------
 5 files changed, 38 insertions(+), 42 deletions(-)

diff --git a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc
index cc6f2d796c5e..a261dbb63d07 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc
@@ -14,8 +14,7 @@ namespace onnxruntime {
 namespace coreml {

 static std::set<std::string> Float16Ops = {
-    "Add", "Mul", "Sub", "Div", "Pow", "Sqrt", "Reciprocal"
-};
+    "Add", "Mul", "Sub", "Div", "Pow", "Sqrt", "Reciprocal"};

 namespace {
 // TODO, move this to shared_library
@@ -89,7 +88,7 @@ bool BaseOpBuilder::HasSupportedInputs(const Node& node, const OpBuilderInputPar

 /* static */
 bool BaseOpBuilder::IsInputDtypeSupport(const Node& node, size_t idx, const OpBuilderInputParams& input_params,
-                                 const logging::Logger& logger) {
+                                        const logging::Logger& logger) {
   if (idx >= node.InputDefs().size()) {
     LOGS(logger, VERBOSE) << "Input index [" << idx << "] is out of range";
     return false;
@@ -105,7 +104,7 @@ bool BaseOpBuilder::IsInputDtypeSupport(const Node& node, size_t idx, const OpBu
   }

   // float is supported
-  if (input_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT){
+  if (input_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT) {
     return true;
   }

diff --git a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h
index 6bd3c43f373c..a2cbef6dd57d 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h
+++ b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h
@@ -34,7 +34,7 @@ class BaseOpBuilder : public IOpBuilder {

   // currently we only support float
   static bool IsInputDtypeSupport(const Node& node, size_t idx, const OpBuilderInputParams& input_params,
-                          const logging::Logger& logger);
+                                  const logging::Logger& logger);

 private:
   virtual bool IsOpSupportedImpl(const Node& /*node*/, const OpBuilderInputParams& /*input_params*/,
diff --git a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc
index 328f8b327992..fc6b5792f364 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc
@@ -119,7 +119,7 @@ void CreateCoreMLWeight(CoreML::Specification::WeightParams& weight, gsl::span<const MLFloat16> data) {
   const char* data_byte_ptr = (const char*)(data.data());
-  weight.mutable_float16value()->assign(data_byte_ptr, data_byte_ptr+data.size_bytes());
+  weight.mutable_float16value()->assign(data_byte_ptr, data_byte_ptr + data.size_bytes());
 }

 namespace {
@@ -141,8 +141,6 @@ void CreateCoreMLWeight(CoreML::Specification::WeightParams& weight, gsl::span<const int64_t>
 template MILSpec::Value CreateTensorValue<float, float>(gsl::span<const float> data, std::optional<gsl::span<const int64_t>> shape);
 template MILSpec::Value CreateTensorValue(gsl::span data,
-                                        std::optional<gsl::span<const int64_t>> shape);
+                                          std::optional<gsl::span<const int64_t>> shape);
 template MILSpec::Value CreateTensorValue(gsl::span data,
-                                        std::optional<gsl::span<const int64_t>> shape);
+                                          std::optional<gsl::span<const int64_t>> shape);
 template MILSpec::Value CreateTensorValue(gsl::span data,
-                                        std::optional<gsl::span<const int64_t>> shape);
+                                          std::optional<gsl::span<const int64_t>> shape);

 template MILSpec::Value CreateScalarTensorValue<float>(const float& data);
 template MILSpec::Value CreateScalarTensorValue<int32_t>(const int32_t& data);

diff --git a/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc
index 595e08d1d771..6d46c3789dec 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc
@@ -23,7 +23,6 @@ Status UnaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
   const auto& op_type(node.OpType());
   const auto& input_defs(node.InputDefs());

-
 #if defined(COREML_ENABLE_MLPROGRAM)
   if (model_builder.CreateMLProgram()) {
     using namespace CoreML::Specification::MILSpec;
@@ -42,7 +41,7 @@ Status UnaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
     std::unique_ptr<Operation> op = model_builder.CreateOperation(node, coreml_op_type);
     AddOperationInput(*op, "x", input_defs[0]->Name());
     if (op_type == "Reciprocal") {
-      float epsilon = 1e-4;  //epsilon: const T (Optional, default=1e-4)
+      float epsilon = 1e-4;  // epsilon: const T (Optional, default=1e-4)
       AddOperationInput(*op, "epsilon", model_builder.AddScalarConstant(op->type(), "epsilon", epsilon));
     }

@@ -52,21 +51,21 @@ Status UnaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
   } else
 #endif  // defined (COREML_ENABLE_MLPROGRAM)
   {
-  std::unique_ptr<COREML_SPEC::NeuralNetworkLayer> layer = model_builder.CreateNNLayer(node);
+    std::unique_ptr<COREML_SPEC::NeuralNetworkLayer> layer = model_builder.CreateNNLayer(node);

-  if (op_type == "Sqrt") {
-    layer->mutable_unary()->set_type(COREML_SPEC::UnaryFunctionLayerParams::SQRT);
-  } else if (op_type == "Reciprocal") {
-    layer->mutable_unary()->set_type(COREML_SPEC::UnaryFunctionLayerParams::INVERSE);
-  } else {
-    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                           "UnaryOpBuilder::AddToModelBuilderImpl, unknown op: ", op_type);
-  }
+    if (op_type == "Sqrt") {
+      layer->mutable_unary()->set_type(COREML_SPEC::UnaryFunctionLayerParams::SQRT);
+    } else if (op_type == "Reciprocal") {
+      layer->mutable_unary()->set_type(COREML_SPEC::UnaryFunctionLayerParams::INVERSE);
+    } else {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                             "UnaryOpBuilder::AddToModelBuilderImpl, unknown op: ", op_type);
+    }

-  *layer->mutable_input()->Add() = input_defs[0]->Name();
-  *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name();
+    *layer->mutable_input()->Add() = input_defs[0]->Name();
+    *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name();

-  model_builder.AddLayer(std::move(layer));
+    model_builder.AddLayer(std::move(layer));
   }
   return Status::OK();
 }
diff --git a/onnxruntime/test/providers/coreml/coreml_basic_test.cc b/onnxruntime/test/providers/coreml/coreml_basic_test.cc
index c9d8a605678b..4fdfd2dc1be2 100644
--- a/onnxruntime/test/providers/coreml/coreml_basic_test.cc
+++ b/onnxruntime/test/providers/coreml/coreml_basic_test.cc
@@ -259,35 +259,35 @@ TEST(CoreMLExecutionProviderTest, TestNameSanitization) {
 }

 TEST(CoreMLExecutionProviderTest, TestBinaryFp16) {
-  auto test_binary_op = [](std::string op){
+  auto test_binary_op = [](std::string op) {
     OpTester test(op, 11);

     std::vector<int64_t> dims{3, 3};
     std::vector<float> input1 = {-1.0f, 0.0f, 1.0f,
-                          -6.0f, 0.0f, 6.0f,
-                          -5.4f, 2.0f, 6.0f};
+                                 -6.0f, 0.0f, 6.0f,
+                                 -5.4f, 2.0f, 6.0f};
     std::vector<MLFloat16> input1_fp16(9);
     ConvertFloatToMLFloat16(input1.data(), input1_fp16.data(), 9);
     std::vector<float> input2 = {-1.0f, 0.0f, 1.0f,
-                          -5.0f, 0.0f, 5.0f,
-                          -5.0f, 2.0f, 5.0f};
+                                 -5.0f, 0.0f, 5.0f,
+                                 -5.0f, 2.0f, 5.0f};
     std::vector<MLFloat16> input2_fp16(9);
     ConvertFloatToMLFloat16(input2.data(), input2_fp16.data(), 9);
     std::vector<float> output(9);
-    if (op == "Add"){
-      for(int i = 0; i < 9; i++){
+    if (op == "Add") {
+      for (int i = 0; i < 9; i++) {
         output[i] = input1_fp16[i] + input2_fp16[i];
       }
     } else if (op == "Sub") {
-      for(int i = 0; i < 9; i++){
+      for (int i = 0; i < 9; i++) {
         output[i] = input1_fp16[i] - input2_fp16[i];
       }
     } else if (op == "Mul") {
-      for(int i = 0; i < 9; i++){
+      for (int i = 0; i < 9; i++) {
         output[i] = input1_fp16[i] * input2_fp16[i];
       }
     } else if (op == "Div") {
-      for(int i = 0; i < 9; i++){
+      for (int i = 0; i < 9; i++) {
         output[i] = input1_fp16[i] / input2_fp16[i];
       }
     }
@@ -310,24 +310,24 @@ TEST(CoreMLExecutionProviderTest, TestBinaryFp16) {
 }

 TEST(CoreMLExecutionProviderTest, TestUnaryFp16) {
-  auto test_binary_op = [](std::string op){
+  auto test_binary_op = [](std::string op) {
     OpTester test(op, 11);

     std::vector<int64_t> dims{3, 3};
     std::vector<float> input1 = {-1.0f, 0.0f, 1.0f,
-                          -6.0f, 0.2f, 6.0f,
-                          -5.4f, 2.0f, 6.0f};
+                                 -6.0f, 0.2f, 6.0f,
+                                 -5.4f, 2.0f, 6.0f};
     std::vector<MLFloat16> input1_fp16(9);
     ConvertFloatToMLFloat16(input1.data(), input1_fp16.data(), 9);

     std::vector<float> output(9);
-    if (op == "Sqrt"){
-      for(int i = 0; i < 9; i++){
+    if (op == "Sqrt") {
+      for (int i = 0; i < 9; i++) {
         output[i] = sqrt(input1_fp16[i]);
       }
     } else if (op == "Reciprocal") {
-      for(int i = 0; i < 9; i++){
-        output[i] = 1.0f/(1e-4+input1_fp16[i]);
+      for (int i = 0; i < 9; i++) {
+        output[i] = 1.0f / (1e-4 + input1_fp16[i]);
       }
     }
     std::vector<MLFloat16> output_fp16(9);

From 0611bf5dc0355a076766c1ed5ef748a43b5fcafa Mon Sep 17 00:00:00 2001
From: wejoncy
Date: Wed, 11 Sep 2024 04:24:31 -0700
Subject: [PATCH 4/6] more ops

---
 .../core/providers/coreml/builders/impl/base_op_builder.cc  | 4 +++-
 .../core/providers/coreml/builders/impl/unary_op_builder.cc | 4 ++--
 onnxruntime/test/providers/coreml/coreml_basic_test.cc      | 5 +++++
 3 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc
index a261dbb63d07..f267dc755135 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc
@@ -14,7 +14,9 @@ namespace onnxruntime {
 namespace coreml {

 static std::set<std::string> Float16Ops = {
-    "Add", "Mul", "Sub", "Div", "Pow", "Sqrt", "Reciprocal"};
+    "Add", "Mul", "Sub", "Div", "Pow", "Sqrt", "Reciprocal",
+    "Sigmoid", "Tanh", "Relu", "LeakyRelu", "Concat", "GridSample", "GlobalAveragePool",
+    "GlobalMaxPool", "AveragePool", "MaxPool", "Reshape", "Split", "Transpose"};

 namespace {
 // TODO, move this to shared_library
diff --git a/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc
index 6d46c3789dec..aa3060d62686 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc
@@ -48,8 +48,8 @@ Status UnaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
     AddOperationOutput(*op, *node.OutputDefs()[0]);

     model_builder.AddOperation(std::move(op));
-  } else
-#endif  // defined (COREML_ENABLE_MLPROGRAM)
+  } else  // NOLINT
+#endif  // defined (COREML_ENABLE_MLPROGRAM)
   {
     std::unique_ptr<COREML_SPEC::NeuralNetworkLayer> layer = model_builder.CreateNNLayer(node);

diff --git a/onnxruntime/test/providers/coreml/coreml_basic_test.cc b/onnxruntime/test/providers/coreml/coreml_basic_test.cc
index 4fdfd2dc1be2..6da88a24bd45 100644
--- a/onnxruntime/test/providers/coreml/coreml_basic_test.cc
+++ b/onnxruntime/test/providers/coreml/coreml_basic_test.cc
@@ -329,6 +329,10 @@ TEST(CoreMLExecutionProviderTest, TestUnaryFp16) {
       for (int i = 0; i < 9; i++) {
         output[i] = 1.0f / (1e-4 + input1_fp16[i]);
       }
+    } else if (op == "Relu") {
+      for (int i = 0; i < 9; i++) {
+        output[i] = fmax(0.0f, input1_fp16[i]);
+      }
     }
     std::vector<MLFloat16> output_fp16(9);
     ConvertFloatToMLFloat16(output.data(), output_fp16.data(), 9);
@@ -343,6 +347,7 @@ TEST(CoreMLExecutionProviderTest, TestUnaryFp16) {
   };
   test_binary_op("Sqrt");
   test_binary_op("Reciprocal");
+  test_binary_op("Relu");
 }
 #endif

From 3944fd606ea96254d64f09b40e6b918f07f777f7 Mon Sep 17 00:00:00 2001
From: wejoncy
Date: Wed, 11 Sep 2024 20:24:31 -0700
Subject: [PATCH 5/6] fix

---
 .../core/providers/coreml/builders/impl/base_op_builder.cc  | 4 ++++
 .../core/providers/coreml/builders/impl/base_op_builder.h   | 2 +-
 .../core/providers/coreml/builders/impl/unary_op_builder.cc | 1 -
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc
index f267dc755135..25d7890faeba 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc
@@ -13,6 +13,8 @@ using namespace CoreML::Specification;
 namespace onnxruntime {
 namespace coreml {

+// Once all ops support FP16, we can remove it. Before that, we keep a set of ops to
+// filter supported ones.
 static std::set<std::string> Float16Ops = {
     "Add", "Mul", "Sub", "Div", "Pow", "Sqrt", "Reciprocal",
     "Sigmoid", "Tanh", "Relu", "LeakyRelu", "Concat", "GridSample", "GlobalAveragePool",
@@ -110,11 +112,13 @@ bool BaseOpBuilder::IsInputDtypeSupport(const Node& node, size_t idx, const OpBu
     return true;
   }

+// only support MLProgram for FP16
 #if defined(COREML_ENABLE_MLPROGRAM)
   if (input_params.create_mlprogram && input_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 && Float16Ops.count(node.OpType())) {
     return true;
   }
 #endif
+
   LOGS(logger, VERBOSE) << "[" << node.OpType() << "] Input type: [" << input_type << "] is not currently supported";
   return false;
 }
diff --git a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h
index a2cbef6dd57d..153ae841b238 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h
+++ b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.h
@@ -32,7 +32,7 @@ class BaseOpBuilder : public IOpBuilder {
       : allow_empty_tensor_as_input_(allow_empty_tensor_as_input) {
   }

-  // currently we only support float
+  // currently we support float/float16
   static bool IsInputDtypeSupport(const Node& node, size_t idx, const OpBuilderInputParams& input_params,
                                   const logging::Logger& logger);

diff --git a/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc
index aa3060d62686..e8a138aa4979 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc
@@ -27,7 +27,6 @@ Status UnaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
   if (model_builder.CreateMLProgram()) {
     using namespace CoreML::Specification::MILSpec;
-    // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#module-coremltools.converters.mil.mil.ops.defs.iOS15.elementwise_binary
     std::string_view coreml_op_type;
     if (op_type == "Sqrt") {
       coreml_op_type = "sqrt";

From 4f935e765e1a6d5ce1945f3daeab7e1a7bd519dc Mon Sep 17 00:00:00 2001
From: wejoncy
Date: Wed, 18 Sep 2024 01:07:49 -0700
Subject: [PATCH 6/6] unify UT

---
 .../coreml/builders/impl/base_op_builder.cc   |   7 ++--
 .../coreml/builders/impl/builder_utils.cc     |  18 +++
 .../coreml/builders/impl/unary_op_builder.cc  |   7 +-
 .../coreml/builders/model_builder.cc          |   8 +
 .../providers/coreml/builders/model_builder.h |   3 +-
 .../providers/coreml/coreml_basic_test.cc     |  92 ------------
 .../cpu/math/element_wise_ops_test.cc         | 137 +++++++++++-------
 .../apple/coreml_supported_mlprogram_ops.md   |   2 +
 8 files changed, 123 insertions(+), 151 deletions(-)

diff --git a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc
index 25d7890faeba..748fe1dad226 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc
@@ -15,9 +15,9 @@ namespace coreml {

 // Once all ops support FP16, we can remove it. Before that, we keep a set of ops to
 // filter supported ones.
-static std::set<std::string> Float16Ops = {
+static std::set<std::string_view> Float16Ops = {
     "Add", "Mul", "Sub", "Div", "Pow", "Sqrt", "Reciprocal",
-    "Sigmoid", "Tanh", "Relu", "LeakyRelu", "Concat", "GridSample", "GlobalAveragePool",
+    "Sigmoid", "Tanh", "Relu", "LeakyRelu", "Concat", "GridSample", "GlobalAveragePool", "Clip", "DepthToSpace", "Resize", "Slice",
     "GlobalMaxPool", "AveragePool", "MaxPool", "Reshape", "Split", "Transpose"};

 namespace {
 // TODO, move this to shared_library
@@ -91,7 +91,8 @@ bool BaseOpBuilder::HasSupportedInputs(const Node& node, const OpBuilderInputPar
 }

 /* static */
-bool BaseOpBuilder::IsInputDtypeSupport(const Node& node, size_t idx, const OpBuilderInputParams& input_params,
+bool BaseOpBuilder::IsInputDtypeSupport(const Node& node, size_t idx,
+                                        [[maybe_unused]] const OpBuilderInputParams& input_params,
                                         const logging::Logger& logger) {
   if (idx >= node.InputDefs().size()) {
     LOGS(logger, VERBOSE) << "Input index [" << idx << "] is out of range";
diff --git a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc
index fc6b5792f364..a27895b6e37f 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc
@@ -131,6 +131,15 @@ void CreateCoreMLWeightConvertingDataToFloats(CoreML::Specification::WeightParam
                  [](T v) { return narrow<float>(v); });
   *weight.mutable_floatvalue() = std::move(weight_floats);
 }
+
+template <typename T>
+void CreateCoreMLWeightConvertingDataToFloat16s(CoreML::Specification::WeightParams& weight, gsl::span<const T> data) {
+  std::vector<MLFloat16> weight_float16s{};
+  weight_float16s.reserve(data.size());
+  std::transform(data.begin(), data.end(), std::back_inserter(weight_float16s),
+                 [](T v) { return MLFloat16(narrow<float>(v)); });
+  CreateCoreMLWeight(weight, weight_float16s);
+}
 }  // namespace

 void CreateCoreMLWeight(CoreML::Specification::WeightParams& weight, gsl::span<const float> data) {
@@ -203,6 +212,13 @@ void CopyDataToTensorValue<float>(MILSpec::TensorValue& tensor_value, gsl::span<const float> data) {
   tensor_value.mutable_floats()->mutable_values()->Add(data.begin(), data.end());
 }

+template <>
+void CopyDataToTensorValue<MLFloat16>(MILSpec::TensorValue& tensor_value, gsl::span<const MLFloat16> data) {
+  const char* begin = (const char*)(data.data());
+  const char* end = (const char*)(data.data()) + data.size() * sizeof(MLFloat16);
+  tensor_value.mutable_bytes()->mutable_values()->assign(begin, end);
+}
+
 template <>
 void CopyDataToTensorValue<int32_t>(MILSpec::TensorValue& tensor_value, gsl::span<const int32_t> data) {
   tensor_value.mutable_ints()->mutable_values()->Add(data.begin(), data.end());
 }
@@ -300,6 +316,8 @@ template MILSpec::Value CreateTensorValue(gsl::span<const int64_t>
 template MILSpec::Value CreateTensorValue(gsl::span data,
                                           std::optional<gsl::span<const int64_t>> shape);
+template MILSpec::Value CreateTensorValue<MLFloat16, MLFloat16>(gsl::span<const MLFloat16> data,
+                                                                std::optional<gsl::span<const int64_t>> shape);
 template MILSpec::Value CreateTensorValue(gsl::span data,
                                           std::optional<gsl::span<const int64_t>> shape);
 template MILSpec::Value CreateTensorValue(gsl::span data,
diff --git a/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc
index e8a138aa4979..335ca737081b 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc
@@ -41,7 +41,12 @@ Status UnaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
     AddOperationInput(*op, "x", input_defs[0]->Name());
     if (op_type == "Reciprocal") {
       float epsilon = 1e-4;  // epsilon: const T (Optional, default=1e-4)
-      AddOperationInput(*op, "epsilon", model_builder.AddScalarConstant(op->type(), "epsilon", epsilon));
+      auto dtype = node.InputDefs()[0]->TypeAsProto()->tensor_type().elem_type();
+      if (dtype == ONNX_NAMESPACE::TensorProto_DataType_FLOAT) {
+        AddOperationInput(*op, "epsilon", model_builder.AddScalarConstant(op->type(), "epsilon", epsilon));
+      } else if (dtype == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) {
+        AddOperationInput(*op, "epsilon", model_builder.AddScalarConstant(op->type(), "epsilon", MLFloat16(epsilon)));
+      }
     }

     AddOperationOutput(*op, *node.OutputDefs()[0]);
diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.cc b/onnxruntime/core/providers/coreml/builders/model_builder.cc
index 7ecfad8493ea..50faebf06875 100644
--- a/onnxruntime/core/providers/coreml/builders/model_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/model_builder.cc
@@ -639,6 +639,14 @@ std::string_view ModelBuilder::AddConstantImpl(std::string_view op_type, std::st
   return AddTensorValueAsConstantOperation(op_type, value_type, std::move(input_value));
 }

+template <>
+std::string_view ModelBuilder::AddConstantImpl(std::string_view op_type, std::string_view value_type,
+                                               gsl::span<const MLFloat16> value,
+                                               std::optional<gsl::span<const int64_t>> shape) {
+  auto input_value = CreateTensorValue<MLFloat16>(value, shape);
+  return AddTensorValueAsConstantOperation(op_type, value_type, std::move(input_value));
+}
+
 template <>
 std::string_view ModelBuilder::AddConstantImpl(std::string_view op_type, std::string_view value_type,
                                                gsl::span value,
diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.h b/onnxruntime/core/providers/coreml/builders/model_builder.h
index bb791fb90290..688dccfc3530 100644
--- a/onnxruntime/core/providers/coreml/builders/model_builder.h
+++ b/onnxruntime/core/providers/coreml/builders/model_builder.h
@@ -107,11 +107,12 @@ class ModelBuilder {
   std::string_view AddConstant(std::string_view op_type, std::string_view value_type, gsl::span<const T> value,
                                std::optional<gsl::span<const int64_t>> shape = std::nullopt) {
     static_assert(std::is_same_v<float, T> ||
+                      std::is_same_v<MLFloat16, T> ||
                       std::is_same_v<int64_t, T> ||
                       std::is_same_v<std::string, T> ||
                       std::is_same_v<bool, T>,
                   // add specialization in AddConstantImpl for new types if needed
-                  "AddConstant currently supports float, int64_t, std::string and bool.");
+                  "AddConstant currently supports float/MLFloat16, int64_t, std::string and bool.");
     return AddConstantImpl(op_type, value_type, value, shape);
   }

diff --git a/onnxruntime/test/providers/coreml/coreml_basic_test.cc b/onnxruntime/test/providers/coreml/coreml_basic_test.cc
index 6da88a24bd45..daa24db13411 100644
--- a/onnxruntime/test/providers/coreml/coreml_basic_test.cc
+++ b/onnxruntime/test/providers/coreml/coreml_basic_test.cc
@@ -257,93 +257,6 @@ TEST(CoreMLExecutionProviderTest, TestNameSanitization) {
   // TensorRT does not support Clip opset 11 yet.
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
 }
-
-TEST(CoreMLExecutionProviderTest, TestBinaryFp16) {
-  auto test_binary_op = [](std::string op) {
-    OpTester test(op, 11);
-
-    std::vector<int64_t> dims{3, 3};
-    std::vector<float> input1 = {-1.0f, 0.0f, 1.0f,
-                                 -6.0f, 0.0f, 6.0f,
-                                 -5.4f, 2.0f, 6.0f};
-    std::vector<MLFloat16> input1_fp16(9);
-    ConvertFloatToMLFloat16(input1.data(), input1_fp16.data(), 9);
-    std::vector<float> input2 = {-1.0f, 0.0f, 1.0f,
-                                 -5.0f, 0.0f, 5.0f,
-                                 -5.0f, 2.0f, 5.0f};
-    std::vector<MLFloat16> input2_fp16(9);
-    ConvertFloatToMLFloat16(input2.data(), input2_fp16.data(), 9);
-    std::vector<float> output(9);
-    if (op == "Add") {
-      for (int i = 0; i < 9; i++) {
-        output[i] = input1_fp16[i] + input2_fp16[i];
-      }
-    } else if (op == "Sub") {
-      for (int i = 0; i < 9; i++) {
-        output[i] = input1_fp16[i] - input2_fp16[i];
-      }
-    } else if (op == "Mul") {
-      for (int i = 0; i < 9; i++) {
-        output[i] = input1_fp16[i] * input2_fp16[i];
-      }
-    } else if (op == "Div") {
-      for (int i = 0; i < 9; i++) {
-        output[i] = input1_fp16[i] / input2_fp16[i];
-      }
-    }
-    std::vector<MLFloat16> output_fp16(9);
-    ConvertFloatToMLFloat16(output.data(), output_fp16.data(), 9);
-
-    test.AddInput<MLFloat16>("0", dims, input1_fp16);
-    test.AddInput<MLFloat16>("1.min", dims, input2_fp16);
-    test.AddOutput<MLFloat16>("3", dims, output_fp16);
-
-    // TensorRT does not support Clip opset 11 yet.
-    std::vector<std::unique_ptr<IExecutionProvider>> coreml_ep;
-    coreml_ep.emplace_back(MakeCoreMLExecutionProvider(COREML_FLAG_CREATE_MLPROGRAM));
-    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &coreml_ep);
-  };
-  test_binary_op("Add");
-  test_binary_op("Sub");
-  test_binary_op("Div");
-  test_binary_op("Mul");
-}
-
-TEST(CoreMLExecutionProviderTest, TestUnaryFp16) {
-  auto test_binary_op = [](std::string op) {
-    OpTester test(op, 11);
-
-    std::vector<int64_t> dims{3, 3};
-    std::vector<float> input1 = {-1.0f, 0.0f, 1.0f,
-                                 -6.0f, 0.2f, 6.0f,
-                                 -5.4f, 2.0f, 6.0f};
-    std::vector<MLFloat16> input1_fp16(9);
-    ConvertFloatToMLFloat16(input1.data(), input1_fp16.data(), 9);
-
-    std::vector<float> output(9);
-    if (op == "Sqrt") {
-      for (int i = 0; i < 9; i++) {
-        output[i] = sqrt(input1_fp16[i]);
-      }
-    } else if (op == "Reciprocal") {
-      for (int i = 0; i < 9; i++) {
-        output[i] = 1.0f / (1e-4 + input1_fp16[i]);
-      }
-    } else if (op == "Relu") {
-      for (int i = 0; i < 9; i++) {
-        output[i] = fmax(0.0f, input1_fp16[i]);
-      }
-    }
-    std::vector<MLFloat16> output_fp16(9);
-    ConvertFloatToMLFloat16(output.data(), output_fp16.data(), 9);
-
-    test.AddInput<MLFloat16>("0", dims, input1_fp16);
-    test.AddOutput<MLFloat16>("3", dims, output_fp16);
-
-    // TensorRT does not support Clip opset 11 yet.
-    std::vector<std::unique_ptr<IExecutionProvider>> coreml_ep;
-    coreml_ep.emplace_back(MakeCoreMLExecutionProvider(COREML_FLAG_CREATE_MLPROGRAM));
-    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &coreml_ep);
-  };
-  test_binary_op("Sqrt");
-  test_binary_op("Reciprocal");
-  test_binary_op("Relu");
-}
 #endif

 }  // namespace test
diff --git a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc
index bd3d21d4929f..659622a70e4c 100644
--- a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc
+++ b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc
@@ -22,26 +22,38 @@ std::vector<MLFloat16> MakeMLFloat16(const std::initializer_list<float>& input)
   return output;
 }

-#if defined(USE_CUDA) || defined(USE_ROCM)
-void TestFloat16(const char* op_name, const std::vector<int64_t>& lhs_dim,
-                 const std::initializer_list<float>& lhs_values, const std::vector<int64_t>& rhs_dim,
-                 const std::initializer_list<float>& rhs_values, const std::vector<int64_t>& out_dim,
-                 const std::initializer_list<float>& out_values) {
+void TestBinaryFloat16(const char* op_name, const std::vector<int64_t>& lhs_dim,
+                       const std::initializer_list<float>& lhs_values, const std::vector<int64_t>& rhs_dim,
+                       const std::initializer_list<float>& rhs_values, const std::vector<int64_t>& out_dim,
+                       const std::initializer_list<float>& out_values, bool enable_bf16 = true) {
+  ORT_UNUSED_PARAMETER(op_name);
+  ORT_UNUSED_PARAMETER(lhs_dim);
+  ORT_UNUSED_PARAMETER(lhs_values);
+  ORT_UNUSED_PARAMETER(rhs_dim);
+  ORT_UNUSED_PARAMETER(rhs_values);
+  ORT_UNUSED_PARAMETER(out_dim);
+  ORT_UNUSED_PARAMETER(out_values);
+  ORT_UNUSED_PARAMETER(enable_bf16);
+#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM)
   {
     OpTester tester(op_name, 14);
    tester.AddInput<MLFloat16>("A", lhs_dim, MakeMLFloat16(lhs_values));
     tester.AddInput<MLFloat16>("B", rhs_dim, MakeMLFloat16(rhs_values));
     tester.AddOutput<MLFloat16>("C", out_dim, MakeMLFloat16(out_values));
     std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
-#ifdef USE_CUDA
+#ifdef COREML_ENABLE_MLPROGRAM
+    execution_providers.push_back(DefaultCoreMLExecutionProvider(true));
+#elif USE_CUDA
     execution_providers.push_back(DefaultCudaExecutionProvider());
 #elif USE_ROCM
     execution_providers.push_back(DefaultRocmExecutionProvider());
 #endif
     tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
   }
+#endif

-  {
+#if defined(USE_CUDA) || defined(USE_ROCM)
+  if (enable_bf16) {
     OpTester tester(op_name, 14);
     tester.AddInput<BFloat16>("A", lhs_dim, MakeBFloat16(lhs_values));
     tester.AddInput<BFloat16>("B", rhs_dim, MakeBFloat16(rhs_values));
     tester.AddOutput<BFloat16>("C", out_dim, MakeBFloat16(out_values));
     std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
 #ifdef USE_CUDA
     execution_providers.push_back(DefaultCudaExecutionProvider());
 #elif USE_ROCM
     execution_providers.push_back(DefaultRocmExecutionProvider());
 #endif
     tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
   }
+#endif
 }

+void TestUnaryFloat16(const char* op_name, const std::vector<int64_t>& lhs_dim,
+                      const std::initializer_list<float>& lhs_values, const std::vector<int64_t>& out_dim,
+                      const std::initializer_list<float>& out_values, int opset = 14) {
+  ORT_UNUSED_PARAMETER(op_name);
+  ORT_UNUSED_PARAMETER(lhs_dim);
+  ORT_UNUSED_PARAMETER(lhs_values);
+  ORT_UNUSED_PARAMETER(out_dim);
+  ORT_UNUSED_PARAMETER(out_values);
+  ORT_UNUSED_PARAMETER(opset);
+#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM)
+  {
+    OpTester tester(op_name, opset);
+    tester.AddInput<MLFloat16>("A", lhs_dim, MakeMLFloat16(lhs_values));
+    tester.AddOutput<MLFloat16>("C", out_dim, MakeMLFloat16(out_values));
+    std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+#ifdef COREML_ENABLE_MLPROGRAM
+    execution_providers.push_back(DefaultCoreMLExecutionProvider(true));
+#elif USE_CUDA
+    execution_providers.push_back(DefaultCudaExecutionProvider());
+#elif USE_ROCM
+    execution_providers.push_back(DefaultRocmExecutionProvider());
+#endif
+    tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
+  }
+#endif
+
+#if defined(USE_CUDA) || defined(USE_ROCM)
+  {
+    OpTester tester(op_name, opset);
+    tester.AddInput<BFloat16>("A", lhs_dim, MakeBFloat16(lhs_values));
+    tester.AddOutput<BFloat16>("C", out_dim, MakeBFloat16(out_values));
+    std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+#ifdef USE_CUDA
+    execution_providers.push_back(DefaultCudaExecutionProvider());
+#elif USE_ROCM
+    execution_providers.push_back(DefaultRocmExecutionProvider());
+#endif
+    tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
+  }
+#endif
+}
+
 void TestBFloat16(const char* op_name, const std::vector<int64_t>& lhs_dim,
                   const std::initializer_list<float>& lhs_values, const std::vector<int64_t>& rhs_dim,
                   const std::initializer_list<float>& rhs_values, const std::vector<int64_t>& out_dim,
@@ -163,9 +218,7 @@ TEST(MathOpTest, Add_float) {
   test.Run();
 #endif

-#if defined(USE_CUDA) || defined(USE_ROCM)
-  TestFloat16("Add", dims, lhs_values, dims, rhs_values, dims, out_values);
-#endif
+  TestBinaryFloat16("Add", dims, lhs_values, dims, rhs_values, dims, out_values);

 #if defined(USE_DNNL)
   TestBFloat16("Add", dims, lhs_values, dims, rhs_values, dims, out_values);
@@ -202,9 +255,7 @@ TEST(MathOpTest, Add_Broadcast_Axis) {
   test.AddOutput("C", dims, out_values);
   test.Run(OpTester::ExpectResult::kExpectSuccess, "");

-#if defined(USE_CUDA) || defined(USE_ROCM)
-  TestFloat16("Add", dims, lhs_values, {3, 1}, rhs_values, dims, out_values);
-#endif
+  TestBinaryFloat16("Add", dims, lhs_values, {3, 1}, rhs_values, dims, out_values);

 #if defined(USE_DNNL)
   TestBFloat16("Add", dims, lhs_values, {3, 1}, rhs_values, dims, out_values);
@@ -228,9 +279,7 @@ TEST(MathOpTest, Add_Broadcast_MultidirectionalAB) {
                 {kTensorrtExecutionProvider});  // TensorRT: got C with shape [3, 1]
 #endif

-#if defined(USE_CUDA) || defined(USE_ROCM)
-  TestFloat16("Add", {3, 1}, lhs_values, {3}, rhs_values, {3, 3}, out_values);
-#endif
+  TestBinaryFloat16("Add", {3, 1}, lhs_values, {3}, rhs_values, {3, 3}, out_values);

 #if defined(USE_DNNL)
   TestBFloat16("Add", {3, 1}, lhs_values, {3}, rhs_values, {3, 3}, out_values);
@@ -254,9 +303,7 @@ TEST(MathOpTest, Add_Broadcast_MultidirectionalBA) {
                 {kTensorrtExecutionProvider});  // TensorRT: got C with shape [3, 1]
 #endif

-#if defined(USE_CUDA) || defined(USE_ROCM)
-  TestFloat16("Add", {3}, lhs_values, {3, 1}, rhs_values, {3, 3}, out_values);
-#endif
+  TestBinaryFloat16("Add", {3}, lhs_values, {3, 1}, rhs_values, {3, 3}, out_values);

 #if defined(USE_DNNL)
   TestBFloat16("Add", {3}, lhs_values, {3, 1}, rhs_values, {3, 3}, out_values);
@@ -527,9 +574,7 @@ TEST(MathOpTest, Sub) {
   test.AddOutput("C", dims, out_values);
   test.Run();

-#if defined(USE_CUDA) || defined(USE_ROCM)
-  TestFloat16("Sub", dims, lhs_values, dims, rhs_values, dims, out_values);
-#endif
+  TestBinaryFloat16("Sub", dims, lhs_values, dims, rhs_values, dims, out_values);

 #if defined(USE_DNNL)
   TestBFloat16("Sub", dims, lhs_values, dims, rhs_values, dims, out_values);
@@ -584,9 +629,7 @@ TEST(MathOpTest, Mul) {

   test.Run();

-#if defined(USE_CUDA) || defined(USE_ROCM)
-  TestFloat16("Mul", dims, lhs_values, dims, rhs_values, dims, out_values);
-#endif
+  TestBinaryFloat16("Mul", dims, lhs_values, dims, rhs_values, dims, out_values);

 #if defined(USE_DNNL)
   TestBFloat16("Mul", dims, lhs_values, dims, rhs_values, dims, out_values);
@@ -622,9 +665,7 @@ TEST(MathOpTest, Div) {
   test.AddOutput("C", dims, out_values);
   test.Run();

-#if defined(USE_CUDA) || defined(USE_ROCM)
-  TestFloat16("Div", dims, lhs_values, dims, rhs_values, dims, out_values);
-#endif
+  TestBinaryFloat16("Div", dims, lhs_values, dims, rhs_values, dims, out_values);

 #if defined(USE_DNNL)
   TestBFloat16("Div", dims, lhs_values, dims, rhs_values, dims, out_values);
@@ -772,13 +813,12 @@ TEST(MathOpTest, Ceil_double) {
 TEST(MathOpTest, Reciprocal) {
   OpTester test("Reciprocal");
   std::vector<int64_t> dims{2, 2};
-  test.AddInput<float>("X", dims,
-                       {1.0f, 2.0f,
-                        -1.0f, -2.0f});
-  test.AddOutput<float>("Y", dims,
-                        {1.0f, 0.5f,
-                         -1.0f, -0.5f});
+  std::initializer_list<float> inputs = {1.0f, 2.0f, -1.0f, -2.0f};
+  std::initializer_list<float> outputs = {1.0f, 0.5f, -1.0f, -0.5f};
+  test.AddInput<float>("X", dims, inputs);
+  test.AddOutput<float>("Y", dims, outputs);
   test.Run();
+  TestUnaryFloat16("Reciprocal", dims, inputs, dims, outputs, 12);
 }

 TEST(MathOpTest, Reciprocal_double) {
@@ -795,14 +835,13 @@ TEST(MathOpTest, Reciprocal_double) {

 TEST(MathOpTest, Sqrt_Float) {
   OpTester test("Sqrt");
+  std::initializer_list<float> inputs = {1.0f, 4.0f, 0.0f, 9.0f};
+  std::initializer_list<float> outputs = {1.0f, 2.0f, 0.0f, 3.0f};
   std::vector<int64_t> dims{2, 2};
-  test.AddInput<float>("X", dims,
-                       {1.0f, 4.0f,
-                        0.0f, 9.0f});
-  test.AddOutput<float>("Y", dims,
-                        {1.0f, 2.0f,
-                         0.0f, 3.0f});
+  test.AddInput<float>("X", dims, inputs);
+  test.AddOutput<float>("Y", dims, outputs);
   test.Run();
+  TestUnaryFloat16("Sqrt", dims, inputs, dims, outputs);
 }

 #if defined(USE_DNNL) || defined(USE_CUDA)
@@ -1056,24 +1095,13 @@ TEST(MathOpTest, Pow_double_int64) {
   test.Run();
 }

-#if defined(USE_CUDA) || defined(USE_ROCM)
 TEST(MathOpTest, Pow_float16_float16) {
-  OpTester test("Pow", 12);
   std::vector<int64_t> dims{4};
-
-  test.AddInput<MLFloat16>("X", dims, MakeMLFloat16({2.0f, 2.0f, std::sqrt(2.0f), 1.0f}));
-  test.AddInput<MLFloat16>("Y", dims, MakeMLFloat16({0.0f, 8.0f, 2.0f, 9.0f}));
-  test.AddOutput<MLFloat16>("Z", dims, MakeMLFloat16({1.0f, 256.0f, 2.0f, 1.0f}));
-
-  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
-#ifdef USE_CUDA
-  execution_providers.push_back(DefaultCudaExecutionProvider());
-#elif USE_ROCM
-  execution_providers.push_back(DefaultRocmExecutionProvider());
-#endif
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
+  TestBinaryFloat16("Pow", dims, {2.0f, 2.0f, std::sqrt(2.0f), 1.0f}, dims, {0.0f, 8.0f, 2.0f, 9.0f},
+                    dims, {1.0f, 256.0f, 2.0f, 1.0f}, false);
 }

+#if defined(USE_CUDA) || defined(USE_ROCM)
 TEST(MathOpTest, Pow_float_float16) {
   OpTester test("Pow", 12);
   std::vector<int64_t> dims{4};
@@ -3660,5 +3688,6 @@ TEST(MathOpTest, BitwiseNot_uint8) {
   test.AddOutput<uint8_t>("Y", dims, {254, 251, 250, 252});
   test.Run();
 }
+
 }  // namespace test
 }  // namespace onnxruntime
diff --git a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md
index bb4cfb2e09dc..0b51311e2271 100644
--- a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md
+++ b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md
@@ -29,3 +29,5 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution
 |ai.onnx:Sigmoid||
 |ai.onnx:Tanh||
 |ai.onnx:Transpose||
+|ai.onnx:Sqrt||
+|ai.onnx:Reciprocal|this asks for an `epsilon` (default 1e-4) that ONNX does not provide|
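
Usage note (not part of the patch series): a minimal sketch of how an application might exercise the fp16 paths added above. It assumes a build with COREML_ENABLE_MLPROGRAM and uses the C API factory declared in include/onnxruntime/core/providers/coreml/coreml_provider_factory.h; the model path is a hypothetical placeholder. COREML_FLAG_CREATE_MLPROGRAM is the same flag the new tests pass to MakeCoreMLExecutionProvider, and only this ML Program path accepts float16 per the Float16Ops allowlist in base_op_builder.cc.

  // Enable the CoreML EP as an ML Program so float16 nodes can be taken by CoreML.
  #include <onnxruntime_cxx_api.h>
  #include <coreml_provider_factory.h>

  int main() {
    Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "coreml_fp16");
    Ort::SessionOptions so;
    // Request the ML Program path; without this flag the NeuralNetwork path is used,
    // which remains float32-only in this patch set.
    Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CoreML(
        so, COREML_FLAG_CREATE_MLPROGRAM));
    Ort::Session session(env, "model_fp16.onnx", so);  // hypothetical fp16 model
    return 0;
  }

Nodes whose op types are not in the Float16Ops allowlist are simply not claimed by the CoreML EP and fall back to the CPU EP, so partial fp16 coverage degrades gracefully rather than failing session creation.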