F16->F32 Convert operator
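
Adds an F16->F32 Convert operator (xnn_create_convert_nc_f16_f32 /
xnn_setup_convert_nc_f16_f32), built on the unary elementwise plumbing,
which now tracks separate input and output element sizes. Microkernel
selection covers NEONFP16 and NEON (int16) on ARM, AVX512-SKX, F16C, AVX,
SSE4.1, and SSE2 on x86, SIMD on WebAssembly, and scalar fallbacks
elsewhere.

A minimal usage sketch (error handling elided; assumes xnn_initialize()
succeeded and fp16_input/fp32_output are caller-provided buffers):

  xnn_operator_t convert_op = NULL;
  xnn_create_convert_nc_f16_f32(
    channels, /*input_stride=*/channels, /*output_stride=*/channels,
    /*flags=*/0, &convert_op);
  xnn_setup_convert_nc_f16_f32(
    convert_op, batch_size, fp16_input, fp32_output, /*threadpool=*/NULL);
  xnn_run_operator(convert_op, /*threadpool=*/NULL);
  xnn_delete_operator(convert_op);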

PiperOrigin-RevId: 405275901
diff --git a/BUILD.bazel b/BUILD.bazel
index dffb344..1eee4bd 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -122,6 +122,8 @@
 ]
 
 PROD_SCALAR_MICROKERNEL_SRCS = [
+    "src/f16-f32-vcvt/gen/vcvt-scalar-float-x1.c",
+    "src/f16-f32-vcvt/gen/vcvt-scalar-float-x4.c",
     "src/f32-argmaxpool/4x-scalar-c1.c",
     "src/f32-argmaxpool/9p8x-scalar-c1.c",
     "src/f32-argmaxpool/9x-scalar-c1.c",
@@ -1999,6 +2001,7 @@
 
 # ISA-specific micro-kernels
 PROD_NEON_MICROKERNEL_SRCS = [
+    "src/f16-f32-vcvt/gen/vcvt-neon-int16-x16.c",
     "src/f32-argmaxpool/4x-neon-c4.c",
     "src/f32-argmaxpool/9p8x-neon-c4.c",
     "src/f32-argmaxpool/9x-neon-c4.c",
@@ -2721,6 +2724,7 @@
 ]
 
 PROD_NEONFP16_MICROKERNEL_SRCS = [
+    "src/f16-f32-vcvt/gen/vcvt-neonfp16-x16.c",
 ]
 
 ALL_NEONFP16_MICROKERNEL_SRCS = [
@@ -3659,6 +3663,7 @@
 ]
 
 PROD_SSE2_MICROKERNEL_SRCS = [
+    "src/f16-f32-vcvt/gen/vcvt-sse2-int16-x32.c",
     "src/f32-argmaxpool/4x-sse2-c4.c",
     "src/f32-argmaxpool/9p8x-sse2-c4.c",
     "src/f32-argmaxpool/9x-sse2-c4.c",
@@ -4055,6 +4060,7 @@
 ]
 
 PROD_SSE41_MICROKERNEL_SRCS = [
+    "src/f16-f32-vcvt/gen/vcvt-sse41-int16-x16.c",
     "src/f32-prelu/gen/sse41-2x8.c",
     "src/f32-vlrelu/gen/vlrelu-sse41-x8.c",
     "src/f32-vrnd/gen/vrndd-sse41-x8.c",
@@ -4346,6 +4352,7 @@
 ]
 
 PROD_AVX_MICROKERNEL_SRCS = [
+    "src/f16-f32-vcvt/gen/vcvt-avx-int16-x16.c",
     "src/f32-dwconv/gen/up8x25-minmax-avx.c",
     "src/f32-dwconv/gen/up16x4-minmax-avx.c",
     "src/f32-dwconv/gen/up16x9-minmax-avx.c",
@@ -4748,6 +4755,7 @@
 ]
 
 PROD_F16C_MICROKERNEL_SRCS = [
+    "src/f16-f32-vcvt/gen/vcvt-f16c-x16.c",
 ]
 
 ALL_F16C_MICROKERNEL_SRCS = [
@@ -5594,6 +5602,7 @@
 ]
 
 PROD_AVX512SKX_MICROKERNEL_SRCS = [
+    "src/f16-f32-vcvt/gen/vcvt-avx512skx-x16.c",
     "src/qc8-dwconv/gen/up32x9-minmax-fp32-avx512skx-mul32.c",
     "src/qc8-dwconv/gen/up32x25-minmax-fp32-avx512skx-mul32.c",
     "src/qc8-gemm/gen/1x16c8-minmax-fp32-avx512skx.c",
@@ -10318,6 +10327,15 @@
 )
 
 xnnpack_unit_test(
+    name = "convert_nc_test",
+    srcs = [
+        "test/convert-nc.cc",
+        "test/convert-operator-tester.h",
+    ],
+    deps = OPERATOR_TEST_DEPS,
+)
+
+xnnpack_unit_test(
     name = "convolution_nhwc_test",
     timeout = "moderate",
     srcs = [
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cb61805..d147005 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -250,6 +250,8 @@
   src/tables/exp2minus-k-over-2048.c)
 
 SET(PROD_SCALAR_MICROKERNEL_SRCS
+  src/f16-f32-vcvt/gen/vcvt-scalar-float-x1.c
+  src/f16-f32-vcvt/gen/vcvt-scalar-float-x4.c
   src/f32-argmaxpool/4x-scalar-c1.c
   src/f32-argmaxpool/9p8x-scalar-c1.c
   src/f32-argmaxpool/9x-scalar-c1.c
@@ -1052,6 +1054,7 @@
   src/xx-pad/scalar.c)
 
 SET(PROD_NEON_MICROKERNEL_SRCS
+  src/f16-f32-vcvt/gen/vcvt-neon-int16-x16.c
   src/f32-argmaxpool/4x-neon-c4.c
   src/f32-argmaxpool/9p8x-neon-c4.c
   src/f32-argmaxpool/9x-neon-c4.c
@@ -1771,6 +1774,9 @@
   src/xx-fill/neon-x64.c
   src/xx-pad/neon.c)
 
+SET(PROD_NEONFP16_MICROKERNEL_SRCS
+  src/f16-f32-vcvt/gen/vcvt-neonfp16-x16.c)
+
 SET(ALL_NEONFP16_MICROKERNEL_SRCS
   src/f16-f32-vcvt/gen/vcvt-neonfp16-x8.c
   src/f16-f32-vcvt/gen/vcvt-neonfp16-x16.c
@@ -2694,6 +2700,7 @@
   src/x32-packx/x4-sse.c)
 
 SET(PROD_SSE2_MICROKERNEL_SRCS
+  src/f16-f32-vcvt/gen/vcvt-sse2-int16-x32.c
   src/f32-argmaxpool/4x-sse2-c4.c
   src/f32-argmaxpool/9p8x-sse2-c4.c
   src/f32-argmaxpool/9x-sse2-c4.c
@@ -3086,6 +3093,7 @@
   src/x8-lut/gen/lut-ssse3-x32.c)
 
 SET(PROD_SSE41_MICROKERNEL_SRCS
+  src/f16-f32-vcvt/gen/vcvt-sse41-int16-x16.c
   src/f32-prelu/gen/sse41-2x8.c
   src/f32-vlrelu/gen/vlrelu-sse41-x8.c
   src/f32-vrnd/gen/vrndd-sse41-x8.c
@@ -3375,6 +3383,7 @@
   src/s8-vclamp/sse41-x64.c)
 
 SET(PROD_AVX_MICROKERNEL_SRCS
+  src/f16-f32-vcvt/gen/vcvt-avx-int16-x16.c
   src/f32-dwconv/gen/up8x25-minmax-avx.c
   src/f32-dwconv/gen/up16x4-minmax-avx.c
   src/f32-dwconv/gen/up16x9-minmax-avx.c
@@ -3772,6 +3781,9 @@
   src/x8-lut/gen/lut-avx-x48.c
   src/x8-lut/gen/lut-avx-x64.c)
 
+SET(PROD_F16C_MICROKERNEL_SRCS
+  src/f16-f32-vcvt/gen/vcvt-f16c-x16.c)
+
 SET(ALL_F16C_MICROKERNEL_SRCS
   src/f16-f32-vcvt/gen/vcvt-f16c-x8.c
   src/f16-f32-vcvt/gen/vcvt-f16c-x16.c
@@ -4609,6 +4621,7 @@
   src/math/sqrt-avx512f-nr2fma.c)
 
 SET(PROD_AVX512SKX_MICROKERNEL_SRCS
+  src/f16-f32-vcvt/gen/vcvt-avx512skx-x16.c
   src/qc8-dwconv/gen/up32x9-minmax-fp32-avx512skx-mul32.c
   src/qc8-dwconv/gen/up32x25-minmax-fp32-avx512skx-mul32.c
   src/qc8-gemm/gen/1x16c8-minmax-fp32-avx512skx.c
@@ -4956,6 +4969,7 @@
 ENDIF()
 IF(XNNPACK_TARGET_PROCESSOR MATCHES "^(aarch64|arm64)$" OR IOS_ARCH MATCHES "^arm64.*")
   LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_NEON_MICROKERNEL_SRCS})
+  LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_NEONFP16_MICROKERNEL_SRCS})
   LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_NEONFMA_MICROKERNEL_SRCS})
   LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_NEONV8_MICROKERNEL_SRCS})
   LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_AARCH64_NEON_MICROKERNEL_SRCS})
@@ -4979,6 +4993,7 @@
   LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_SSSE3_MICROKERNEL_SRCS})
   LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_SSE41_MICROKERNEL_SRCS})
   LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_AVX_MICROKERNEL_SRCS})
+  LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_F16C_MICROKERNEL_SRCS})
   LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_XOP_MICROKERNEL_SRCS})
   LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_FMA3_MICROKERNEL_SRCS})
   LIST(APPEND PROD_MICROKERNEL_SRCS ${PROD_AVX2_MICROKERNEL_SRCS})
@@ -5359,6 +5374,15 @@
   TARGET_LINK_LIBRARIES(constant-pad-nd-test PRIVATE XNNPACK gtest gtest_main)
   ADD_TEST(constant-pad-nd-test constant-pad-nd-test)
 
+  ADD_EXECUTABLE(convert-nc-test test/convert-nc.cc)
+  SET_TARGET_PROPERTIES(convert-nc-test PROPERTIES
+    CXX_STANDARD 11
+    CXX_STANDARD_REQUIRED YES
+    CXX_EXTENSIONS YES)
+  TARGET_INCLUDE_DIRECTORIES(convert-nc-test PRIVATE src test)
+  TARGET_LINK_LIBRARIES(convert-nc-test PRIVATE XNNPACK fp16 gtest gtest_main)
+  ADD_TEST(convert-nc-test convert-nc-test)
+
   ADD_EXECUTABLE(convolution-nhwc-test test/convolution-nhwc.cc)
   SET_TARGET_PROPERTIES(convolution-nhwc-test PROPERTIES
     CXX_STANDARD 11
diff --git a/include/xnnpack.h b/include/xnnpack.h
index 24c6118..f6a6683 100644
--- a/include/xnnpack.h
+++ b/include/xnnpack.h
@@ -2706,6 +2706,24 @@
 
 #endif  // XNN_NO_X8_OPERATORS
 
+#ifndef XNN_NO_CVT_OPERATORS
+
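+// Create a Convert operator that converts FP16 (IEEE half-precision) input
+// to FP32 output. Strides are specified in elements and must be at least as
+// large as the number of channels.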
+enum xnn_status xnn_create_convert_nc_f16_f32(
+  size_t channels,
+  size_t input_stride,
+  size_t output_stride,
+  uint32_t flags,
+  xnn_operator_t* convert_op_out);
+
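+// Set up a previously created Convert operator for a batch of rows: input
+// points to FP16 elements, output to FP32 elements.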
+enum xnn_status xnn_setup_convert_nc_f16_f32(
+  xnn_operator_t convert_op,
+  size_t batch_size,
+  const void* input,
+  float* output,
+  pthreadpool_t threadpool);
+
+#endif  // XNN_NO_CVT_OPERATORS
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/src/init.c b/src/init.c
index 37a7209..b825aba 100644
--- a/src/init.c
+++ b/src/init.c
@@ -53,6 +53,7 @@
 #include <xnnpack/unpool.h>
 #include <xnnpack/vaddsub.h>
 #include <xnnpack/vbinary.h>
+#include <xnnpack/vcvt.h>
 #include <xnnpack/vmul.h>
 #include <xnnpack/vmulcaddc.h>
 #include <xnnpack/vunary.h>
@@ -614,6 +615,17 @@
       #endif  // XNN_NO_NCHW_OPERATORS
     #endif  // XNN_NO_F32_OPERATORS
 
+    /*************************** VCVT micro-kernels ***************************/
+    #ifndef XNN_NO_VCVT_OPERATORS
+      init_flags |= XNN_INIT_FLAG_VCVT;
+
+      if (cpuinfo_has_arm_neon_fp16()) {
+        xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__neonfp16_x16;
+      } else {
+        xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__neon_int16_x16;
+      }
+    #endif  // XNN_NO_VCVT_OPERATORS
+
     /**************************** X32 micro-kernels ****************************/
     #ifndef XNN_NO_X32_OPERATORS
       init_flags |= XNN_INIT_FLAG_X32;
@@ -1006,6 +1018,13 @@
       #endif  // XNN_NO_NCHW_OPERATORS
     #endif  // XNN_NO_F32_OPERATORS
 
+    /*************************** VCVT micro-kernels ***************************/
+    #ifndef XNN_NO_VCVT_OPERATORS
+      init_flags |= XNN_INIT_FLAG_VCVT;
+
+      xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__scalar_float_x4;
+    #endif  // XNN_NO_VCVT_OPERATORS
+
     /**************************** X32 micro-kernels ****************************/
     #ifndef XNN_NO_X32_OPERATORS
       init_flags |= XNN_INIT_FLAG_X32;
@@ -2146,6 +2165,13 @@
     #endif  // XNN_NO_NCHW_OPERATORS
   #endif  // XNN_NO_F32_OPERATORS
 
+  /*************************** VCVT micro-kernels ***************************/
+  #ifndef XNN_NO_VCVT_OPERATORS
+    init_flags |= XNN_INIT_FLAG_VCVT;
+
+    xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__neonfp16_x16;
+  #endif  // XNN_NO_VCVT_OPERATORS
+
   /**************************** X32 micro-kernels ****************************/
   #ifndef XNN_NO_X32_OPERATORS
     init_flags |= XNN_INIT_FLAG_X32;
@@ -3255,6 +3281,23 @@
     #endif  // XNN_NO_NCHW_OPERATORS
   #endif  // XNN_NO_F32_OPERATORS
 
+  /*************************** VCVT micro-kernels ***************************/
+  #ifndef XNN_NO_VCVT_OPERATORS
+    init_flags |= XNN_INIT_FLAG_VCVT;
+
+    if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
+      xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__avx512skx_x16;
+    } else if (cpuinfo_has_x86_f16c()) {
+      xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__f16c_x16;
+    } else if (cpuinfo_has_x86_avx()) {
+      xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__avx_int16_x16;
+    } else if (cpuinfo_has_x86_sse4_1()) {
+      xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__sse41_int16_x16;
+    } else {
+      xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__sse2_int16_x32;
+    }
+  #endif  // XNN_NO_VCVT_OPERATORS
+
   /**************************** X32 micro-kernels ****************************/
   #ifndef XNN_NO_X32_OPERATORS
     init_flags |= XNN_INIT_FLAG_X32;
@@ -3916,6 +3959,13 @@
     #endif  // XNN_NO_NCHW_OPERATORS
   #endif  // XNN_NO_F32_OPERATORS
 
+  /*************************** VCVT micro-kernels ***************************/
+  #ifndef XNN_NO_VCVT_OPERATORS
+    init_flags |= XNN_INIT_FLAG_VCVT;
+
+    xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_x16;
+  #endif  // XNN_NO_VCVT_OPERATORS
+
   /**************************** X32 micro-kernels ****************************/
   #ifndef XNN_NO_X32_OPERATORS
     init_flags |= XNN_INIT_FLAG_X32;
@@ -4400,6 +4450,13 @@
     #endif  // XNN_NO_NCHW_OPERATORS
   #endif  // XNN_NO_F32_OPERATORS
 
+  /*************************** VCVT micro-kernels ***************************/
+  #ifndef XNN_NO_VCVT_OPERATORS
+    init_flags |= XNN_INIT_FLAG_VCVT;
+
+    xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__scalar_float_x1;
+  #endif  // XNN_NO_VCVT_OPERATORS
+
   /**************************** X32 micro-kernels ****************************/
   #ifndef XNN_NO_X32_OPERATORS
     init_flags |= XNN_INIT_FLAG_X32;
diff --git a/src/operator-strings.c b/src/operator-strings.c
index ac56da4..b723e75 100644
--- a/src/operator-strings.c
+++ b/src/operator-strings.c
@@ -52,6 +52,8 @@
       return "Constant Pad (ND, X8)";
     case xnn_operator_type_constant_pad_nd_x32:
       return "Constant Pad (ND, X32)";
+    case xnn_operator_type_convert_nc_f16_f32:
+      return "Convert (NC, F16, F32)";
     case xnn_operator_type_convolution_nhwc_f16:
       return "Convolution (NHWC, F16)";
     case xnn_operator_type_convolution_nhwc_f32:
diff --git a/src/operators/unary-elementwise-nc.c b/src/operators/unary-elementwise-nc.c
index f76fe58..95aaebe 100644
--- a/src/operators/unary-elementwise-nc.c
+++ b/src/operators/unary-elementwise-nc.c
@@ -39,7 +39,7 @@
   if (channels == 0) {
     xnn_log_error(
       "failed to create %s operator with %zu channels: number of channels must be non-zero",
-      xnn_operator_type_to_string(xnn_operator_type_clamp_nc_f32), channels);
+      xnn_operator_type_to_string(operator_type), channels);
     return xnn_status_invalid_parameter;
   }
 
@@ -47,7 +47,7 @@
     xnn_log_error(
       "failed to create %s operator with input element stride of %zu: "
       "stride must be at least as large as the number of channels (%zu)",
-      xnn_operator_type_to_string(xnn_operator_type_clamp_nc_f32), input_stride, channels);
+      xnn_operator_type_to_string(operator_type), input_stride, channels);
     return xnn_status_invalid_parameter;
   }
 
@@ -55,7 +55,7 @@
     xnn_log_error(
       "failed to create %s operator with output element stride of %zu: "
       "stride must be at least as large as the number of channels (%zu)",
-      xnn_operator_type_to_string(xnn_operator_type_clamp_nc_f32), output_stride, channels);
+      xnn_operator_type_to_string(operator_type), output_stride, channels);
     return xnn_status_invalid_parameter;
   }
 
@@ -89,7 +89,8 @@
     size_t batch_size,
     const void* input,
     void* output,
-    uint32_t log2_element_size,
+    uint32_t log2_input_size,
+    uint32_t log2_output_size,
     const void* params,
     size_t params_size)
 {
@@ -114,9 +115,9 @@
     const size_t block_size = 4096;
     unary_elementwise_op->context.univector_contiguous = (struct univector_contiguous_context) {
       .x = input,
-      .x_stride = input_stride << log2_element_size,
+      .x_stride = input_stride << log2_input_size,
       .y = output,
-      .y_stride = output_stride << log2_element_size,
+      .y_stride = output_stride << log2_output_size,
       .ukernel = ukernel,
     };
     if (params_size != 0) {
@@ -124,15 +125,15 @@
     }
     unary_elementwise_op->compute.type = xnn_parallelization_type_1d_tile_1d;
     unary_elementwise_op->compute.task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) xnn_compute_univector_contiguous;
-    unary_elementwise_op->compute.range[0] = (batch_size * channels) << log2_element_size;
+    unary_elementwise_op->compute.range[0] = (batch_size * channels) << log2_output_size;
     unary_elementwise_op->compute.tile[0] = block_size;
   } else {
     unary_elementwise_op->context.univector_strided = (struct univector_strided_context) {
-      .n = channels << log2_element_size,
+      .n = channels << log2_output_size,
       .x = input,
-      .x_stride = input_stride << log2_element_size,
+      .x_stride = input_stride << log2_input_size,
       .y = output,
-      .y_stride = output_stride << log2_element_size,
+      .y_stride = output_stride << log2_output_size,
       .ukernel = ukernel,
     };
     if (params_size != 0) {
@@ -301,6 +302,21 @@
     ceiling_op_out);
 }
 
+enum xnn_status xnn_create_convert_nc_f16_f32(
+  size_t channels,
+  size_t input_stride,
+  size_t output_stride,
+  uint32_t flags,
+  xnn_operator_t* convert_op_out)
+{
+  return create_unary_elementwise_nc(
+    channels, input_stride, output_stride, flags,
+    NULL, 0,
+    xnn_operator_type_convert_nc_f16_f32,
+    xnn_params.vcvt.f16_to_f32,
+    convert_op_out);
+}
+
 enum xnn_status xnn_create_copy_nc_x32(
     size_t channels,
     size_t input_stride,
@@ -530,6 +546,7 @@
     abs_op,
     batch_size, input, output,
     2 /* log2(sizeof(float)) */,
+    2 /* log2(sizeof(float)) */,
     &abs_op->params.f32_abs, sizeof(abs_op->params.f32_abs));
 }
 
@@ -552,6 +569,7 @@
     rounding_op,
     batch_size, input, output,
     2 /* log2(sizeof(float)) */,
+    2 /* log2(sizeof(float)) */,
     &rounding_op->params.f32_rnd, sizeof(rounding_op->params.f32_rnd));
 }
 
@@ -574,6 +592,7 @@
     ceiling_op,
     batch_size, input, output,
     2 /* log2(sizeof(float)) */,
+    2 /* log2(sizeof(float)) */,
     &ceiling_op->params.f32_rnd, sizeof(ceiling_op->params.f32_rnd));
 }
 
@@ -596,6 +615,7 @@
     clamp_op,
     batch_size, input, output,
     0 /* log2(sizeof(int8_t)) */,
+    0 /* log2(sizeof(int8_t)) */,
     &clamp_op->params.s8_minmax, sizeof(clamp_op->params.s8_minmax));
 }
 
@@ -618,6 +638,7 @@
     clamp_op,
     batch_size, input, output,
     0 /* log2(sizeof(uint8_t)) */,
+    0 /* log2(sizeof(uint8_t)) */,
     &clamp_op->params.u8_minmax, sizeof(clamp_op->params.u8_minmax));
 }
 
@@ -640,9 +661,33 @@
     clamp_op,
     batch_size, input, output,
     2 /* log2(sizeof(float)) */,
+    2 /* log2(sizeof(float)) */,
     &clamp_op->params.f32_minmax, sizeof(clamp_op->params.f32_minmax));
 }
 
+enum xnn_status xnn_setup_convert_nc_f16_f32(
+  xnn_operator_t convert_op,
+  size_t batch_size,
+  const void* input,
+  float* output,
+  pthreadpool_t threadpool)
+{
+  if (convert_op->type != xnn_operator_type_convert_nc_f16_f32) {
+    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
+      xnn_operator_type_to_string(xnn_operator_type_convert_nc_f16_f32),
+      xnn_operator_type_to_string(convert_op->type));
+    return xnn_status_invalid_parameter;
+  }
+  convert_op->state = xnn_run_state_invalid;
+
+  return setup_unary_elementwise_nc(
+    convert_op,
+    batch_size, input, output,
+    1 /* log2(sizeof(uint16_t)) */,
+    2 /* log2(sizeof(float)) */,
+    NULL, 0);
+}
+
 enum xnn_status xnn_setup_copy_nc_x32(
     xnn_operator_t copy_op,
     size_t batch_size,
@@ -662,6 +707,7 @@
     copy_op,
     batch_size, input, output,
     2 /* log2(sizeof(uint32_t)) */,
+    2 /* log2(sizeof(uint32_t)) */,
     NULL, 0);
 }
 
@@ -684,6 +730,7 @@
     elu_op,
     batch_size, input, output,
     2 /* log2(sizeof(float)) */,
+    2 /* log2(sizeof(float)) */,
     &elu_op->params.f32_elu, sizeof(elu_op->params.f32_elu));
 }
 
@@ -706,6 +753,7 @@
     floor_op,
     batch_size, input, output,
     2 /* log2(sizeof(float)) */,
+    2 /* log2(sizeof(float)) */,
     &floor_op->params.f32_rnd, sizeof(floor_op->params.f32_rnd));
 }
 
@@ -728,6 +776,7 @@
     hardswish_op,
     batch_size, input, output,
     1 /* log2(sizeof(half)) */,
+    1 /* log2(sizeof(half)) */,
     &hardswish_op->params.f16_hswish, sizeof(hardswish_op->params.f16_hswish));
 }
 
@@ -750,6 +799,7 @@
     hardswish_op,
     batch_size, input, output,
     2 /* log2(sizeof(float)) */,
+    2 /* log2(sizeof(float)) */,
     &hardswish_op->params.f32_hswish, sizeof(hardswish_op->params.f32_hswish));
 }
 
@@ -772,6 +822,7 @@
     leaky_relu_op,
     batch_size, input, output,
     2 /* log2(sizeof(float)) */,
+    2 /* log2(sizeof(float)) */,
     &leaky_relu_op->params.f32_lrelu, sizeof(leaky_relu_op->params.f32_lrelu));
 }
 
@@ -794,6 +845,7 @@
     negate_op,
     batch_size, input, output,
     2 /* log2(sizeof(float)) */,
+    2 /* log2(sizeof(float)) */,
     &negate_op->params.f32_neg, sizeof(negate_op->params.f32_neg));
 }
 
@@ -816,6 +868,7 @@
     sigmoid_op,
     batch_size, input, output,
     2 /* log2(sizeof(float)) */,
+    2 /* log2(sizeof(float)) */,
     NULL, 0);
 }
 
@@ -838,6 +891,7 @@
     square_op,
     batch_size, input, output,
     2 /* log2(sizeof(float)) */,
+    2 /* log2(sizeof(float)) */,
     NULL, 0);
 }
 
@@ -860,6 +914,7 @@
     sqrt_op,
     batch_size, input, output,
     2 /* log2(sizeof(float)) */,
+    2 /* log2(sizeof(float)) */,
     NULL, 0);
 }
 
@@ -882,5 +937,6 @@
     truncation_op,
     batch_size, input, output,
     2 /* log2(sizeof(float)) */,
+    2 /* log2(sizeof(float)) */,
     &truncation_op->params.f32_rnd, sizeof(truncation_op->params.f32_rnd));
 }
\ No newline at end of file
diff --git a/src/xnnpack/operator.h b/src/xnnpack/operator.h
index 549367e..a8b835d 100644
--- a/src/xnnpack/operator.h
+++ b/src/xnnpack/operator.h
@@ -49,6 +49,7 @@
   xnn_operator_type_ceiling_nc_f32,
   xnn_operator_type_constant_pad_nd_x8,
   xnn_operator_type_constant_pad_nd_x32,
+  xnn_operator_type_convert_nc_f16_f32,
   xnn_operator_type_convolution_nchw_f32,
   xnn_operator_type_convolution_nhwc_f16,
   xnn_operator_type_convolution_nhwc_f32,
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index 169e492..4e8c724 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -2600,8 +2600,10 @@
 #define XNN_INIT_FLAG_X8      0x00000400
 // Indicates that XX XNNPACK microkernels are available for use.
 #define XNN_INIT_FLAG_XX      0x00000800
+// Indicates that VCVT XNNPACK microkernels are available for use.
+#define XNN_INIT_FLAG_VCVT    0x00001000
 // Indicates that CHW XNNPACK microkernels are optimized for the host platform.
-#define XNN_INIT_FLAG_CHW_OPT 0x00001000
+#define XNN_INIT_FLAG_CHW_OPT 0x00002000
 
 struct xnn_parameters {
   // Bitwise combination of XNN_INIT_FLAG_* flags
@@ -2708,6 +2710,9 @@
     struct ibilinear_chw_parameters ibilinear_chw;
   } f32;
   struct {
+    xnn_univector_ukernel_function f16_to_f32;
+  } vcvt;
+  struct {
     xnn_unpool_ukernel_function unpool;
     struct zip_parameters zip;
     // Depth To Space 2D with CHW->HWC layout conversion.
diff --git a/test/convert-nc.cc b/test/convert-nc.cc
new file mode 100644
index 0000000..aa9d417
--- /dev/null
+++ b/test/convert-nc.cc
@@ -0,0 +1,63 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <gtest/gtest.h>
+
+#include "convert-operator-tester.h"
+
+
+TEST(CONVERT_NC_F16_F32, unit_batch) {
+  for (size_t channels = 1; channels < 100; channels++) {
+    ConvertOperatorTester()
+      .batch_size(1)
+      .channels(channels)
+      .iterations(3)
+      .TestF16toF32();
+  }
+}
+
+TEST(CONVERT_NC_F16_F32, small_batch) {
+  for (size_t channels = 1; channels < 100; channels++) {
+    ConvertOperatorTester()
+      .batch_size(3)
+      .channels(channels)
+      .iterations(3)
+      .TestF16toF32();
+  }
+}
+
+TEST(CONVERT_NC_F16_F32, small_batch_with_input_stride) {
+  for (size_t channels = 1; channels < 100; channels += 15) {
+    ConvertOperatorTester()
+      .batch_size(3)
+      .channels(channels)
+      .input_stride(129)
+      .iterations(3)
+      .TestF16toF32();
+  }
+}
+
+TEST(CONVERT_NC_F16_F32, small_batch_with_output_stride) {
+  for (size_t channels = 1; channels < 100; channels += 15) {
+    ConvertOperatorTester()
+      .batch_size(3)
+      .channels(channels)
+      .output_stride(117)
+      .iterations(3)
+      .TestF16toF32();
+  }
+}
+
+TEST(CONVERT_NC_F16_F32, small_batch_with_input_and_output_stride) {
+  for (size_t channels = 1; channels < 100; channels += 15) {
+    ConvertOperatorTester()
+      .batch_size(3)
+      .channels(channels)
+      .input_stride(129)
+      .output_stride(117)
+      .iterations(3)
+      .TestF16toF32();
+  }
+}
diff --git a/test/convert-operator-tester.h b/test/convert-operator-tester.h
new file mode 100644
index 0000000..ff33e17
--- /dev/null
+++ b/test/convert-operator-tester.h
@@ -0,0 +1,145 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <functional>
+#include <memory>
+#include <random>
+#include <vector>
+
+#include <fp16.h>
+
+#include <xnnpack.h>
+
+
+class ConvertOperatorTester {
+ public:
+  inline ConvertOperatorTester& channels(size_t channels) {
+    assert(channels != 0);
+    this->channels_ = channels;
+    return *this;
+  }
+
+  inline size_t channels() const {
+    return this->channels_;
+  }
+
+  inline ConvertOperatorTester& input_stride(size_t input_stride) {
+    assert(input_stride != 0);
+    this->input_stride_ = input_stride;
+    return *this;
+  }
+
+  inline size_t input_stride() const {
+    if (this->input_stride_ == 0) {
+      return this->channels_;
+    } else {
+      assert(this->input_stride_ >= this->channels_);
+      return this->input_stride_;
+    }
+  }
+
+  inline ConvertOperatorTester& output_stride(size_t output_stride) {
+    assert(output_stride != 0);
+    this->output_stride_ = output_stride;
+    return *this;
+  }
+
+  inline size_t output_stride() const {
+    if (this->output_stride_ == 0) {
+      return this->channels_;
+    } else {
+      assert(this->output_stride_ >= this->channels_);
+      return this->output_stride_;
+    }
+  }
+
+  inline ConvertOperatorTester& batch_size(size_t batch_size) {
+    assert(batch_size != 0);
+    this->batch_size_ = batch_size;
+    return *this;
+  }
+
+  inline size_t batch_size() const {
+    return this->batch_size_;
+  }
+
+  inline ConvertOperatorTester& iterations(size_t iterations) {
+    this->iterations_ = iterations;
+    return *this;
+  }
+
+  inline size_t iterations() const {
+    return this->iterations_;
+  }
+
+  void TestF16toF32() const {
+    std::random_device random_device;
+    auto rng = std::mt19937(random_device());
+    auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), rng);
+    auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+    std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) +
+      (batch_size() - 1) * input_stride() + channels());
+    std::vector<float> output((batch_size() - 1) * output_stride() + channels());
+    std::vector<float> output_ref(batch_size() * channels());
+    for (size_t iteration = 0; iteration < iterations(); iteration++) {
+      std::generate(input.begin(), input.end(), std::ref(f16rng));
+      std::fill(output.begin(), output.end(), std::nanf(""));
+
+      // Compute reference results.
+      for (size_t i = 0; i < batch_size(); i++) {
+        for (size_t c = 0; c < channels(); c++) {
+          output_ref[i * channels() + c] = fp16_ieee_to_fp32_value(input[i * input_stride() + c]);
+        }
+      }
+
+      // Create, setup, run, and destroy Convert operator.
+      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+      xnn_operator_t convert_op = nullptr;
+
+      ASSERT_EQ(xnn_status_success,
+        xnn_create_convert_nc_f16_f32(
+          channels(), input_stride(), output_stride(),
+          0, &convert_op));
+      ASSERT_NE(nullptr, convert_op);
+
+      // Smart pointer to automatically delete convert op.
+      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convert_op(convert_op, xnn_delete_operator);
+
+      ASSERT_EQ(xnn_status_success,
+        xnn_setup_convert_nc_f16_f32(
+          convert_op,
+          batch_size(),
+          input.data(), output.data(),
+          nullptr /* thread pool */));
+
+      ASSERT_EQ(xnn_status_success,
+        xnn_run_operator(convert_op, nullptr /* thread pool */));
+
+      // Verify results.
+      for (size_t i = 0; i < batch_size(); i++) {
+        for (size_t c = 0; c < channels(); c++) {
+          ASSERT_EQ(output_ref[i * channels() + c], output[i * output_stride() + c])
+            << "at batch " << i << " / " << batch_size() << ", channel " << c << " / " << channels();
+        }
+      }
+    }
+  }
+
+ private:
+  size_t batch_size_{1};
+  size_t channels_{1};
+  size_t input_stride_{0};
+  size_t output_stride_{0};
+  size_t iterations_{15};
+};