mirror of
https://github.com/espruino/Espruino.git
synced 2025-12-08 19:06:15 +00:00
update to new tensorflow
This commit is contained in:
parent
7ac9ed85f7
commit
b216c275aa
@ -99,15 +99,16 @@ void *jswrap_tfmicrointerpreter_getTFMI(JsVar *parent) {
|
||||
return tfPtr;
|
||||
}
|
||||
|
||||
JsVar *jswrap_tfmicrointerpreter_tensorToArrayBuffer(JsVar *parent, TfLiteTensor *tensor) {
|
||||
JsVar *jswrap_tfmicrointerpreter_tensorToArrayBuffer(JsVar *parent, bool isInput) {
|
||||
void *tfmi = jswrap_tfmicrointerpreter_getTFMI(parent);
|
||||
JsVar *mi = jsvObjectGetChild(parent, "mi", 0);
|
||||
if (!tensor && !mi) {
|
||||
tf_tensorfinfo tensor = tf_get(tfmi, isInput);
|
||||
if (!tensor.data || !mi) {
|
||||
jsExceptionHere(JSET_ERROR, "Unable to get tensor");
|
||||
return 0;
|
||||
}
|
||||
JsVarDataArrayBufferViewType abType = ARRAYBUFFERVIEW_UNDEFINED;
|
||||
switch (tensor->type) {
|
||||
switch (tensor.type) {
|
||||
case kTfLiteFloat32 :
|
||||
abType = ARRAYBUFFERVIEW_FLOAT32; break;
|
||||
case kTfLiteInt32 :
|
||||
@ -119,12 +120,12 @@ JsVar *jswrap_tfmicrointerpreter_tensorToArrayBuffer(JsVar *parent, TfLiteTensor
|
||||
case kTfLiteInt8 :
|
||||
abType = ARRAYBUFFERVIEW_INT8; break;
|
||||
default:
|
||||
jsExceptionHere(JSET_TYPEERROR, "Unsupported Tensor format TfLiteType:%d", tensor->type);
|
||||
jsExceptionHere(JSET_TYPEERROR, "Unsupported Tensor format TfLiteType:%d", tensor.type);
|
||||
return 0;
|
||||
}
|
||||
|
||||
JsVar *ab = jsvNewArrayBufferFromString(mi,0);
|
||||
JsVar *b = jswrap_typedarray_constructor(abType, ab, ((size_t)&tensor->data.f[0])-(size_t)tfmi, tensor->bytes / JSV_ARRAYBUFFER_GET_SIZE(abType));
|
||||
JsVar *b = jswrap_typedarray_constructor(abType, ab, ((size_t)tensor.data)-(size_t)tfmi, tensor.bytes / JSV_ARRAYBUFFER_GET_SIZE(abType));
|
||||
jsvUnLock2(ab,mi);
|
||||
return b;
|
||||
}
|
||||
@ -138,9 +139,7 @@ JsVar *jswrap_tfmicrointerpreter_tensorToArrayBuffer(JsVar *parent, TfLiteTensor
|
||||
}
|
||||
*/
|
||||
JsVar *jswrap_tfmicrointerpreter_getInput(JsVar *parent) {
|
||||
void *tfmi = jswrap_tfmicrointerpreter_getTFMI(parent);
|
||||
if (!tfmi) return 0;
|
||||
return jswrap_tfmicrointerpreter_tensorToArrayBuffer(parent, tf_get_input(tfmi, 0));
|
||||
return jswrap_tfmicrointerpreter_tensorToArrayBuffer(parent, true);
|
||||
}
|
||||
/*JSON{
|
||||
"type" : "method",
|
||||
@ -152,9 +151,7 @@ JsVar *jswrap_tfmicrointerpreter_getInput(JsVar *parent) {
|
||||
}
|
||||
*/
|
||||
JsVar *jswrap_tfmicrointerpreter_getOutput(JsVar *parent) {
|
||||
void *tfmi = jswrap_tfmicrointerpreter_getTFMI(parent);
|
||||
if (!tfmi) return 0;
|
||||
return jswrap_tfmicrointerpreter_tensorToArrayBuffer(parent, tf_get_output(tfmi, 0));
|
||||
return jswrap_tfmicrointerpreter_tensorToArrayBuffer(parent, false);
|
||||
}
|
||||
/*JSON{
|
||||
"type" : "method",
|
||||
|
||||
@ -53,7 +53,7 @@ typedef struct {
|
||||
tflite::ops::micro::AllOpsResolver resolver;
|
||||
#else
|
||||
#define TENSORFLOW_OP_COUNT 6
|
||||
tflite::MicroOpResolver<TENSORFLOW_OP_COUNT> resolver;
|
||||
tflite::MicroMutableOpResolver<TENSORFLOW_OP_COUNT> resolver;
|
||||
#endif
|
||||
// Build an interpreter to run the model with
|
||||
tflite::MicroInterpreter interpreter;
|
||||
@ -88,19 +88,13 @@ bool tf_create(void *dataPtr, size_t arena_size, const char *model_data) {
|
||||
new (&tf->resolver)tflite::ops::micro::AllOpsResolver();
|
||||
#else
|
||||
// Pull in only the operation implementations we need.
|
||||
new (&tf->resolver)tflite::MicroOpResolver<TENSORFLOW_OP_COUNT>();
|
||||
tf->resolver.AddBuiltin( tflite::BuiltinOperator_DEPTHWISE_CONV_2D,
|
||||
tflite::ops::micro::Register_DEPTHWISE_CONV_2D(), tflite::MicroOpResolverAnyVersion());
|
||||
tf->resolver.AddBuiltin( tflite::BuiltinOperator_CONV_2D,
|
||||
tflite::ops::micro::Register_CONV_2D(), tflite::MicroOpResolverAnyVersion());
|
||||
tf->resolver.AddBuiltin( tflite::BuiltinOperator_AVERAGE_POOL_2D,
|
||||
tflite::ops::micro::Register_AVERAGE_POOL_2D(), tflite::MicroOpResolverAnyVersion());
|
||||
tf->resolver.AddBuiltin( tflite::BuiltinOperator_MAX_POOL_2D,
|
||||
tflite::ops::micro::Register_MAX_POOL_2D(), tflite::MicroOpResolverAnyVersion());
|
||||
tf->resolver.AddBuiltin( tflite::BuiltinOperator_FULLY_CONNECTED,
|
||||
tflite::ops::micro::Register_FULLY_CONNECTED(), tflite::MicroOpResolverAnyVersion());
|
||||
tf->resolver.AddBuiltin( tflite::BuiltinOperator_SOFTMAX,
|
||||
tflite::ops::micro::Register_SOFTMAX(), tflite::MicroOpResolverAnyVersion());
|
||||
new (&tf->resolver)tflite::MicroMutableOpResolver<TENSORFLOW_OP_COUNT>();
|
||||
tf->resolver.AddDepthwiseConv2D();
|
||||
tf->resolver.AddConv2D();
|
||||
tf->resolver.AddAveragePool2D();
|
||||
tf->resolver.AddMaxPool2D();
|
||||
tf->resolver.AddFullyConnected();
|
||||
tf->resolver.AddSoftmax();
|
||||
#endif
|
||||
|
||||
// Build an interpreter to run the model with
|
||||
@ -133,16 +127,17 @@ bool tf_invoke(void *dataPtr) {
|
||||
return true;
|
||||
}
|
||||
|
||||
TfLiteTensor *tf_get_input(void *dataPtr, int n) {
|
||||
tf_tensorfinfo tf_get(void *dataPtr, bool isInput) {
|
||||
TFData *tf = (TFData*)dataPtr;
|
||||
// Obtain pointers to the model's input and output tensors
|
||||
return tf->interpreter.input(0);
|
||||
}
|
||||
|
||||
TfLiteTensor *tf_get_output(void *dataPtr, int n) {
|
||||
TFData *tf = (TFData*)dataPtr;
|
||||
// Obtain pointers to the model's input and output tensors
|
||||
return tf->interpreter.output(0);
|
||||
tf_tensorfinfo inf;
|
||||
inf.data = 0;
|
||||
TfLiteTensor *tensor = isInput ? tf->interpreter.input(0) : tf->interpreter.output(0);
|
||||
if (tensor) {
|
||||
inf.type = tensor->type;
|
||||
inf.data = &tensor->data.f[0];
|
||||
inf.bytes = tensor->bytes;
|
||||
}
|
||||
return inf;
|
||||
}
|
||||
|
||||
} // extern "C"
|
||||
|
||||
@ -1,8 +1,14 @@
|
||||
#include "tensorflow/lite/c/common.h"
|
||||
|
||||
typedef struct {
|
||||
void *data; // pointer to data
|
||||
TfLiteType type; // data type
|
||||
int bytes; // how big is the tensor
|
||||
} tf_tensorfinfo;
|
||||
|
||||
|
||||
size_t tf_get_size(size_t arena_size, const char *model_data);
|
||||
bool tf_create(void *dataPtr, size_t arena_size, const char *model_data);
|
||||
void tf_destroy(void *dataPtr);
|
||||
bool tf_invoke(void *dataPtr);
|
||||
TfLiteTensor *tf_get_input(void *dataPtr, int n);
|
||||
TfLiteTensor *tf_get_output(void *dataPtr, int n);
|
||||
tf_tensorfinfo tf_get(void *dataPtr, bool isInput);
|
||||
|
||||
@ -21,7 +21,7 @@ limitations under the License.
|
||||
// Also update tensorflow/tensorflow.bzl and
|
||||
// tensorflow/tools/pip_package/setup.py
|
||||
#define TF_MAJOR_VERSION 2
|
||||
#define TF_MINOR_VERSION 1
|
||||
#define TF_MINOR_VERSION 4
|
||||
#define TF_PATCH_VERSION 0
|
||||
|
||||
// TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
|
||||
@ -108,7 +108,7 @@ limitations under the License.
|
||||
|
||||
#define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
|
||||
#define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
|
||||
#define TF_GRAPH_DEF_VERSION 398 // Updated: 2020/5/11
|
||||
#define TF_GRAPH_DEF_VERSION 498 // Updated: 2020/8/19
|
||||
|
||||
// Checkpoint compatibility versions (the versions field in SavedSliceMeta).
|
||||
//
|
||||
|
||||
@ -67,8 +67,9 @@ typedef struct {
|
||||
typedef enum {
|
||||
kTfLiteActNone = 0,
|
||||
kTfLiteActRelu,
|
||||
kTfLiteActRelu1, // min(max(-1, x), 1)
|
||||
kTfLiteActRelu6, // min(max(0, x), 6)
|
||||
kTfLiteActReluN1To1, // min(max(-1, x), 1)
|
||||
kTfLiteActRelu1 = kTfLiteActReluN1To1, // kTfLiteActRelu1 will be deprecated.
|
||||
kTfLiteActRelu6, // min(max(0, x), 6)
|
||||
kTfLiteActTanh,
|
||||
kTfLiteActSignBit,
|
||||
kTfLiteActSigmoid,
|
||||
@ -198,6 +199,8 @@ typedef struct {
|
||||
|
||||
typedef struct {
|
||||
TfLiteFusedActivation activation;
|
||||
// Parameter added for the version 4.
|
||||
bool pot_scale_int16;
|
||||
} TfLiteAddParams;
|
||||
|
||||
typedef struct {
|
||||
@ -219,6 +222,8 @@ typedef struct {
|
||||
|
||||
typedef struct {
|
||||
TfLiteFusedActivation activation;
|
||||
// Parameter added for the version 5.
|
||||
bool pot_scale_int16;
|
||||
} TfLiteSubParams;
|
||||
|
||||
typedef struct {
|
||||
|
||||
@ -79,7 +79,8 @@ TfLiteFloatArray* TfLiteFloatArrayCreate(int size) {
|
||||
void TfLiteFloatArrayFree(TfLiteFloatArray* a) { free(a); }
|
||||
|
||||
void TfLiteTensorDataFree(TfLiteTensor* t) {
|
||||
if (t->allocation_type == kTfLiteDynamic) {
|
||||
if (t->allocation_type == kTfLiteDynamic ||
|
||||
t->allocation_type == kTfLitePersistentRo) {
|
||||
free(t->data.raw);
|
||||
}
|
||||
t->data.raw = NULL;
|
||||
@ -172,7 +173,8 @@ void TfLiteTensorReset(TfLiteType type, const char* name, TfLiteIntArray* dims,
|
||||
}
|
||||
|
||||
void TfLiteTensorRealloc(size_t num_bytes, TfLiteTensor* tensor) {
|
||||
if (tensor->allocation_type != kTfLiteDynamic) {
|
||||
if (tensor->allocation_type != kTfLiteDynamic &&
|
||||
tensor->allocation_type != kTfLitePersistentRo) {
|
||||
return;
|
||||
}
|
||||
// TODO(b/145340303): Tensor data should be aligned.
|
||||
@ -205,6 +207,8 @@ const char* TfLiteTypeGetName(TfLiteType type) {
|
||||
return "BOOL";
|
||||
case kTfLiteComplex64:
|
||||
return "COMPLEX64";
|
||||
case kTfLiteComplex128:
|
||||
return "COMPLEX128";
|
||||
case kTfLiteString:
|
||||
return "STRING";
|
||||
case kTfLiteFloat16:
|
||||
|
||||
@ -47,7 +47,8 @@ extern "C" {
|
||||
typedef enum TfLiteStatus {
|
||||
kTfLiteOk = 0,
|
||||
kTfLiteError = 1,
|
||||
kTfLiteDelegateError = 2
|
||||
kTfLiteDelegateError = 2,
|
||||
kTfLiteApplicationError = 3
|
||||
} TfLiteStatus;
|
||||
|
||||
// The list of external context types known to TF Lite. This list exists solely
|
||||
@ -58,7 +59,7 @@ typedef enum TfLiteExternalContextType {
|
||||
kTfLiteEigenContext = 0, // include eigen_support.h to use.
|
||||
kTfLiteGemmLowpContext = 1, // include gemm_support.h to use.
|
||||
kTfLiteEdgeTpuContext = 2, // Placeholder for Edge TPU support.
|
||||
kTfLiteCpuBackendContext = 3, // include cpu_backend_support.h to use.
|
||||
kTfLiteCpuBackendContext = 3, // include cpu_backend_context.h to use.
|
||||
kTfLiteMaxExternalContexts = 4
|
||||
} TfLiteExternalContextType;
|
||||
|
||||
@ -86,8 +87,9 @@ typedef struct TfLiteIntArray {
|
||||
int size;
|
||||
// gcc 6.1+ have a bug where flexible members aren't properly handled
|
||||
// https://github.com/google/re2/commit/b94b7cd42e9f02673cd748c1ac1d16db4052514c
|
||||
#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 6 && \
|
||||
__GNUC_MINOR__ >= 1
|
||||
#if (!defined(__clang__) && defined(__GNUC__) && __GNUC__ == 6 && \
|
||||
__GNUC_MINOR__ >= 1) || \
|
||||
defined(HEXAGON)
|
||||
int data[0];
|
||||
#else
|
||||
int data[];
|
||||
@ -125,6 +127,7 @@ typedef struct TfLiteFloatArray {
|
||||
int size;
|
||||
// gcc 6.1+ have a bug where flexible members aren't properly handled
|
||||
// https://github.com/google/re2/commit/b94b7cd42e9f02673cd748c1ac1d16db4052514c
|
||||
// This also applies to the toolchain used for Qualcomm Hexagon DSPs.
|
||||
#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 6 && \
|
||||
__GNUC_MINOR__ >= 1
|
||||
float data[0];
|
||||
@ -203,6 +206,7 @@ void TfLiteFloatArrayFree(TfLiteFloatArray* a);
|
||||
// the current function, while also reporting the location of the error.
|
||||
// `a` and `b` may be evaluated more than once, so no side effects or
|
||||
// extremely expensive computations should be done.
|
||||
// NOTE: Use TF_LITE_ENSURE_TYPES_EQ if comparing TfLiteTypes.
|
||||
#define TF_LITE_ENSURE_EQ(context, a, b) \
|
||||
do { \
|
||||
if ((a) != (b)) { \
|
||||
@ -230,11 +234,32 @@ void TfLiteFloatArrayFree(TfLiteFloatArray* a);
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
// Define TFL_CAPI_EXPORT macro to export a function properly with a shared
|
||||
// library.
|
||||
#ifdef SWIG
|
||||
#define TFL_CAPI_EXPORT
|
||||
#else
|
||||
#if defined(_WIN32)
|
||||
#ifdef TFL_COMPILE_LIBRARY
|
||||
#define TFL_CAPI_EXPORT __declspec(dllexport)
|
||||
#else
|
||||
#define TFL_CAPI_EXPORT __declspec(dllimport)
|
||||
#endif // TFL_COMPILE_LIBRARY
|
||||
#else
|
||||
#define TFL_CAPI_EXPORT __attribute__((visibility("default")))
|
||||
#endif // _WIN32
|
||||
#endif // SWIG
|
||||
|
||||
// Single-precision complex data type compatible with the C99 definition.
|
||||
typedef struct TfLiteComplex64 {
|
||||
float re, im; // real and imaginary parts, respectively.
|
||||
} TfLiteComplex64;
|
||||
|
||||
// Double-precision complex data type compatible with the C99 definition.
|
||||
typedef struct TfLiteComplex128 {
|
||||
double re, im; // real and imaginary parts, respectively.
|
||||
} TfLiteComplex128;
|
||||
|
||||
// Half precision data type compatible with the C99 definition.
|
||||
typedef struct TfLiteFloat16 {
|
||||
uint16_t data;
|
||||
@ -254,6 +279,7 @@ typedef enum {
|
||||
kTfLiteInt8 = 9,
|
||||
kTfLiteFloat16 = 10,
|
||||
kTfLiteFloat64 = 11,
|
||||
kTfLiteComplex128 = 12,
|
||||
} TfLiteType;
|
||||
|
||||
// Return the name of a given type, for error reporting purposes.
|
||||
@ -310,26 +336,39 @@ typedef union TfLitePtrUnion {
|
||||
int64_t* i64;
|
||||
float* f;
|
||||
TfLiteFloat16* f16;
|
||||
double* f64;
|
||||
char* raw;
|
||||
const char* raw_const;
|
||||
uint8_t* uint8;
|
||||
bool* b;
|
||||
int16_t* i16;
|
||||
TfLiteComplex64* c64;
|
||||
TfLiteComplex128* c128;
|
||||
int8_t* int8;
|
||||
/* Only use this member. */
|
||||
void* data;
|
||||
} TfLitePtrUnion;
|
||||
|
||||
// Memory allocation strategies. kTfLiteMmapRo is for read-only memory-mapped
|
||||
// data (or data externally allocated). kTfLiteArenaRw is arena allocated
|
||||
// data. kTfLiteDynamic is for tensors that are allocated during evaluation.
|
||||
// Memory allocation strategies.
|
||||
// * kTfLiteMmapRo: Read-only memory-mapped data, or data externally allocated.
|
||||
// * kTfLiteArenaRw: Arena allocated with no guarantees about persistence,
|
||||
// and available during eval.
|
||||
// * kTfLiteArenaRwPersistent: Arena allocated but persistent across eval, and
|
||||
// only available during eval.
|
||||
// * kTfLiteDynamic: Allocated during eval, or for string tensors.
|
||||
// * kTfLitePersistentRo: Allocated and populated during prepare. This is
|
||||
// useful for tensors that can be computed during prepare and treated
|
||||
// as constant inputs for downstream ops (also in prepare).
|
||||
// * kTfLiteCustom: Custom memory allocation provided by the user. See
|
||||
// TfLiteCustomAllocation below.
|
||||
typedef enum TfLiteAllocationType {
|
||||
kTfLiteMemNone = 0,
|
||||
kTfLiteMmapRo,
|
||||
kTfLiteArenaRw,
|
||||
kTfLiteArenaRwPersistent,
|
||||
kTfLiteDynamic,
|
||||
kTfLitePersistentRo,
|
||||
kTfLiteCustom,
|
||||
} TfLiteAllocationType;
|
||||
|
||||
// The delegates should use zero or positive integers to represent handles.
|
||||
@ -362,8 +401,18 @@ typedef struct TfLiteSparsity {
|
||||
int dim_metadata_size;
|
||||
} TfLiteSparsity;
|
||||
|
||||
// Defines a custom memory allocation not owned by the runtime.
|
||||
// `data` should be aligned to kDefaultTensorAlignment defined in
|
||||
// lite/util.h. (Currently 64 bytes)
|
||||
// NOTE: See Interpreter.SetCustomAllocationForTensor for details on usage.
|
||||
typedef struct TfLiteCustomAllocation {
|
||||
void* data;
|
||||
size_t bytes;
|
||||
} TfLiteCustomAllocation;
|
||||
|
||||
// An tensor in the interpreter system which is a wrapper around a buffer of
|
||||
// data including a dimensionality (or NULL if not currently defined).
|
||||
#ifndef TF_LITE_STATIC_MEMORY
|
||||
typedef struct TfLiteTensor {
|
||||
// The data type specification for data stored in `data`. This affects
|
||||
// what member of `data` union should be used.
|
||||
@ -429,31 +478,6 @@ typedef struct TfLiteTensor {
|
||||
const TfLiteIntArray* dims_signature;
|
||||
} TfLiteTensor;
|
||||
|
||||
#ifndef TF_LITE_STATIC_MEMORY
|
||||
// Free data memory of tensor `t`.
|
||||
void TfLiteTensorDataFree(TfLiteTensor* t);
|
||||
|
||||
// Free quantization data.
|
||||
void TfLiteQuantizationFree(TfLiteQuantization* quantization);
|
||||
|
||||
// Free sparsity parameters.
|
||||
void TfLiteSparsityFree(TfLiteSparsity* sparsity);
|
||||
|
||||
// Free memory of tensor `t`.
|
||||
void TfLiteTensorFree(TfLiteTensor* t);
|
||||
|
||||
// Set all of a tensor's fields (and free any previously allocated data).
|
||||
void TfLiteTensorReset(TfLiteType type, const char* name, TfLiteIntArray* dims,
|
||||
TfLiteQuantizationParams quantization, char* buffer,
|
||||
size_t size, TfLiteAllocationType allocation_type,
|
||||
const void* allocation, bool is_variable,
|
||||
TfLiteTensor* tensor);
|
||||
|
||||
// Resize the allocated data of a (dynamic) tensor. Tensors with allocation
|
||||
// types other than kTfLiteDynamic will be ignored.
|
||||
void TfLiteTensorRealloc(size_t num_bytes, TfLiteTensor* tensor);
|
||||
#endif // TF_LITE_STATIC_MEMORY
|
||||
|
||||
// A structure representing an instance of a node.
|
||||
// This structure only exhibits the inputs, outputs and user defined data, not
|
||||
// other features like the type.
|
||||
@ -490,6 +514,130 @@ typedef struct TfLiteNode {
|
||||
// WARNING: This is an experimental interface that is subject to change.
|
||||
struct TfLiteDelegate* delegate;
|
||||
} TfLiteNode;
|
||||
#else // defined(TF_LITE_STATIC_MEMORY)?
|
||||
// NOTE: This flag is opt-in only at compile time.
|
||||
//
|
||||
// Specific reduced TfLiteTensor struct for TF Micro runtime. This struct
|
||||
// contains only the minimum fields required to initialize and prepare a micro
|
||||
// inference graph. The fields in this struct have been ordered from
|
||||
// largest-to-smallest for optimal struct sizeof.
|
||||
//
|
||||
// This struct does not use:
|
||||
// - allocation
|
||||
// - buffer_handle
|
||||
// - data_is_stale
|
||||
// - delegate
|
||||
// - dims_signature
|
||||
// - name
|
||||
// - sparsity
|
||||
typedef struct TfLiteTensor {
|
||||
// TODO(b/155784997): Consider consolidating these quantization fields:
|
||||
// Quantization information. Replaces params field above.
|
||||
TfLiteQuantization quantization;
|
||||
|
||||
// Quantization information.
|
||||
TfLiteQuantizationParams params;
|
||||
|
||||
// A union of data pointers. The appropriate type should be used for a typed
|
||||
// tensor based on `type`.
|
||||
TfLitePtrUnion data;
|
||||
|
||||
// A pointer to a structure representing the dimensionality interpretation
|
||||
// that the buffer should have. NOTE: the product of elements of `dims`
|
||||
// and the element datatype size should be equal to `bytes` below.
|
||||
TfLiteIntArray* dims;
|
||||
|
||||
// The number of bytes required to store the data of this Tensor. I.e.
|
||||
// (bytes of each element) * dims[0] * ... * dims[n-1]. For example, if
|
||||
// type is kTfLiteFloat32 and dims = {3, 2} then
|
||||
// bytes = sizeof(float) * 3 * 2 = 4 * 3 * 2 = 24.
|
||||
size_t bytes;
|
||||
|
||||
// The data type specification for data stored in `data`. This affects
|
||||
// what member of `data` union should be used.
|
||||
TfLiteType type;
|
||||
|
||||
// How memory is mapped
|
||||
// kTfLiteMmapRo: Memory mapped read only.
|
||||
// i.e. weights
|
||||
// kTfLiteArenaRw: Arena allocated read write memory
|
||||
// (i.e. temporaries, outputs).
|
||||
TfLiteAllocationType allocation_type;
|
||||
|
||||
// True if the tensor is a variable.
|
||||
bool is_variable;
|
||||
} TfLiteTensor;
|
||||
|
||||
// Specific reduced TfLiteNode struct for TF Micro runtime. This struct contains
|
||||
// only the minimum fields required to represent a node.
|
||||
//
|
||||
// This struct does not use:
|
||||
// - delegate
|
||||
// - intermediates
|
||||
// - temporaries
|
||||
typedef struct TfLiteNode {
|
||||
// Inputs to this node expressed as indices into the simulator's tensors.
|
||||
TfLiteIntArray* inputs;
|
||||
|
||||
// Outputs to this node expressed as indices into the simulator's tensors.
|
||||
TfLiteIntArray* outputs;
|
||||
|
||||
// Opaque data provided by the node implementer through `Registration.init`.
|
||||
void* user_data;
|
||||
|
||||
// Opaque data provided to the node if the node is a builtin. This is usually
|
||||
// a structure defined in builtin_op_data.h
|
||||
void* builtin_data;
|
||||
|
||||
// Custom initial data. This is the opaque data provided in the flatbuffer.
|
||||
// WARNING: This is an experimental interface that is subject to change.
|
||||
const void* custom_initial_data;
|
||||
int custom_initial_data_size;
|
||||
} TfLiteNode;
|
||||
#endif // TF_LITE_STATIC_MEMORY
|
||||
|
||||
// Light-weight tensor struct for TF Micro runtime. Provides the minimal amount
|
||||
// of information required for a kernel to run during TfLiteRegistration::Eval.
|
||||
// TODO(b/160955687): Move this field into TF_LITE_STATIC_MEMORY when TFLM
|
||||
// builds with this flag by default internally.
|
||||
typedef struct TfLiteEvalTensor {
|
||||
// A union of data pointers. The appropriate type should be used for a typed
|
||||
// tensor based on `type`.
|
||||
TfLitePtrUnion data;
|
||||
|
||||
// A pointer to a structure representing the dimensionality interpretation
|
||||
// that the buffer should have.
|
||||
TfLiteIntArray* dims;
|
||||
|
||||
// The data type specification for data stored in `data`. This affects
|
||||
// what member of `data` union should be used.
|
||||
TfLiteType type;
|
||||
} TfLiteEvalTensor;
|
||||
|
||||
#ifndef TF_LITE_STATIC_MEMORY
|
||||
// Free data memory of tensor `t`.
|
||||
void TfLiteTensorDataFree(TfLiteTensor* t);
|
||||
|
||||
// Free quantization data.
|
||||
void TfLiteQuantizationFree(TfLiteQuantization* quantization);
|
||||
|
||||
// Free sparsity parameters.
|
||||
void TfLiteSparsityFree(TfLiteSparsity* sparsity);
|
||||
|
||||
// Free memory of tensor `t`.
|
||||
void TfLiteTensorFree(TfLiteTensor* t);
|
||||
|
||||
// Set all of a tensor's fields (and free any previously allocated data).
|
||||
void TfLiteTensorReset(TfLiteType type, const char* name, TfLiteIntArray* dims,
|
||||
TfLiteQuantizationParams quantization, char* buffer,
|
||||
size_t size, TfLiteAllocationType allocation_type,
|
||||
const void* allocation, bool is_variable,
|
||||
TfLiteTensor* tensor);
|
||||
|
||||
// Resize the allocated data of a (dynamic) tensor. Tensors with allocation
|
||||
// types other than kTfLiteDynamic will be ignored.
|
||||
void TfLiteTensorRealloc(size_t num_bytes, TfLiteTensor* tensor);
|
||||
#endif // TF_LITE_STATIC_MEMORY
|
||||
|
||||
// WARNING: This is an experimental interface that is subject to change.
|
||||
//
|
||||
@ -581,12 +729,11 @@ typedef struct TfLiteContext {
|
||||
void* profiler;
|
||||
|
||||
// Allocate persistent buffer which has the same life time as the interpreter.
|
||||
// Returns nullptr on failure.
|
||||
// The memory is allocated from heap for TFL, and from tail in TFLM.
|
||||
// If *ptr is not nullptr, the pointer will be reallocated.
|
||||
// This method is only available in Prepare stage.
|
||||
// This method is only available in Init or Prepare stage.
|
||||
// WARNING: This is an experimental interface that is subject to change.
|
||||
TfLiteStatus (*AllocatePersistentBuffer)(struct TfLiteContext* ctx,
|
||||
size_t bytes, void** ptr);
|
||||
void* (*AllocatePersistentBuffer)(struct TfLiteContext* ctx, size_t bytes);
|
||||
|
||||
// Allocate a buffer which will be deallocated right after invoke phase.
|
||||
// The memory is allocated from heap in TFL, and from volatile arena in TFLM.
|
||||
@ -641,6 +788,18 @@ typedef struct TfLiteContext {
|
||||
TfLiteStatus (*PreviewDelegatePartitioning)(
|
||||
struct TfLiteContext* context, const TfLiteIntArray* nodes_to_replace,
|
||||
TfLiteDelegateParams** partition_params_array, int* num_partitions);
|
||||
|
||||
// Returns a TfLiteTensor struct for a given index.
|
||||
// WARNING: This is an experimental interface that is subject to change.
|
||||
// WARNING: This method may not be available on all platforms.
|
||||
TfLiteTensor* (*GetTensor)(const struct TfLiteContext* context,
|
||||
int tensor_idx);
|
||||
|
||||
// Returns a TfLiteEvalTensor struct for a given index.
|
||||
// WARNING: This is an experimental interface that is subject to change.
|
||||
// WARNING: This method may not be available on all platforms.
|
||||
TfLiteEvalTensor* (*GetEvalTensor)(const struct TfLiteContext* context,
|
||||
int tensor_idx);
|
||||
} TfLiteContext;
|
||||
|
||||
typedef struct TfLiteRegistration {
|
||||
@ -715,7 +874,26 @@ typedef enum TfLiteDelegateFlags {
|
||||
//
|
||||
// If the delegate isn't capable to handle dynamic tensors, this flag need
|
||||
// to be set to false.
|
||||
kTfLiteDelegateFlagsAllowDynamicTensors = 1
|
||||
kTfLiteDelegateFlagsAllowDynamicTensors = 1,
|
||||
|
||||
// This flag can be used by delegates (that allow dynamic tensors) to ensure
|
||||
// applicable tensor shapes are automatically propagated in the case of tensor
|
||||
// resizing.
|
||||
// This means that non-dynamic (allocation_type != kTfLiteDynamic) I/O tensors
|
||||
// of a delegate kernel will have correct shapes before its Prepare() method
|
||||
// is called. The runtime leverages TFLite builtin ops in the original
|
||||
// execution plan to propagate shapes.
|
||||
//
|
||||
// A few points to note:
|
||||
// 1. This requires kTfLiteDelegateFlagsAllowDynamicTensors. If that flag is
|
||||
// false, this one is redundant since the delegate kernels are re-initialized
|
||||
// every time tensors are resized.
|
||||
// 2. Enabling this flag adds some overhead to AllocateTensors(), since extra
|
||||
// work is required to prepare the original execution plan.
|
||||
// 3. This flag requires that the original execution plan only have ops with
|
||||
// valid registrations (and not 'dummy' custom ops like with Flex).
|
||||
// WARNING: This feature is experimental and subject to change.
|
||||
kTfLiteDelegateFlagsRequirePropagatedShapes = 2
|
||||
} TfLiteDelegateFlags;
|
||||
|
||||
// WARNING: This is an experimental interface that is subject to change.
|
||||
@ -734,8 +912,9 @@ typedef struct TfLiteDelegate {
|
||||
struct TfLiteDelegate* delegate);
|
||||
|
||||
// Copy the data from delegate buffer handle into raw memory of the given
|
||||
// 'tensor'. This cannot be null. The delegate is allowed to allocate the raw
|
||||
// bytes as long as it follows the rules for kTfLiteDynamic tensors.
|
||||
// 'tensor'. Note that the delegate is allowed to allocate the raw bytes as
|
||||
// long as it follows the rules for kTfLiteDynamic tensors, in which case this
|
||||
// cannot be null.
|
||||
TfLiteStatus (*CopyFromBufferHandle)(TfLiteContext* context,
|
||||
struct TfLiteDelegate* delegate,
|
||||
TfLiteBufferHandle buffer_handle,
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -19,9 +19,12 @@ limitations under the License.
|
||||
// flatbuffer serialization format into in-memory values that are used by the
|
||||
// runtime API and interpreter.
|
||||
|
||||
#include <cstddef>
|
||||
#include <new>
|
||||
#include <type_traits>
|
||||
|
||||
#include "tensorflow/lite/c/common.h"
|
||||
#include "tensorflow/lite/core/api/error_reporter.h"
|
||||
#include "tensorflow/lite/core/api/op_resolver.h"
|
||||
#include "tensorflow/lite/schema/schema_generated.h"
|
||||
|
||||
namespace tflite {
|
||||
@ -66,6 +69,185 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
|
||||
TfLiteStatus ConvertTensorType(TensorType tensor_type, TfLiteType* type,
|
||||
ErrorReporter* error_reporter);
|
||||
|
||||
TfLiteStatus ParseAbs(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseAdd(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseArgMax(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseArgMin(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseCeil(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseConcatenation(const Operator* op,
|
||||
ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator,
|
||||
void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseConv2D(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseCos(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseDepthwiseConv2D(const Operator* op,
|
||||
ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator,
|
||||
void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseDequantize(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator,
|
||||
void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseEqual(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseFloor(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseFullyConnected(const Operator* op,
|
||||
ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator,
|
||||
void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseGreater(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseGreaterEqual(const Operator* op,
|
||||
ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator,
|
||||
void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseHardSwish(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator,
|
||||
void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseL2Normalization(const Operator* op,
|
||||
ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator,
|
||||
void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseLess(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseLessEqual(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator,
|
||||
void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseLog(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseLogicalAnd(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator,
|
||||
void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseLogicalNot(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator,
|
||||
void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseLogicalOr(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator,
|
||||
void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseLogistic(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator,
|
||||
void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseMaximum(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseMinimum(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseMul(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseNeg(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseNotEqual(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator,
|
||||
void** builtin_data);
|
||||
|
||||
TfLiteStatus ParsePack(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParsePad(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParsePadV2(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParsePool(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParsePrelu(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseQuantize(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator,
|
||||
void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseReducer(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseRelu(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseRelu6(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseReshape(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseResizeNearestNeighbor(const Operator* op,
|
||||
ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator,
|
||||
void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseRound(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseRsqrt(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseSin(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseSoftmax(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseSplit(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseSqrt(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseSquare(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseStridedSlice(const Operator* op,
|
||||
ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator,
|
||||
void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseSub(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseSvdf(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseTanh(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
TfLiteStatus ParseUnpack(const Operator* op, ErrorReporter* error_reporter,
|
||||
BuiltinDataAllocator* allocator, void** builtin_data);
|
||||
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_CORE_API_FLATBUFFER_CONVERSIONS_H_
|
||||
|
||||
@ -15,6 +15,10 @@ limitations under the License.
|
||||
|
||||
#include "tensorflow/lite/core/api/op_resolver.h"
|
||||
|
||||
#include "flatbuffers/flatbuffers.h" // from @flatbuffers
|
||||
#include "tensorflow/lite/c/common.h"
|
||||
#include "tensorflow/lite/core/api/error_reporter.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
TfLiteStatus GetRegistrationFromOpCode(
|
||||
|
||||
@ -15,6 +15,8 @@ limitations under the License.
|
||||
#ifndef TENSORFLOW_LITE_CORE_API_OP_RESOLVER_H_
|
||||
#define TENSORFLOW_LITE_CORE_API_OP_RESOLVER_H_
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "tensorflow/lite/c/common.h"
|
||||
#include "tensorflow/lite/core/api/error_reporter.h"
|
||||
#include "tensorflow/lite/schema/schema_generated.h"
|
||||
@ -32,6 +34,16 @@ class OpResolver {
|
||||
/// Finds the op registration of a custom operator by op name.
|
||||
virtual const TfLiteRegistration* FindOp(const char* op,
|
||||
int version) const = 0;
|
||||
|
||||
// Returns optional delegates for resolving and handling ops in the flatbuffer
|
||||
// model. This may be used in addition to the standard TfLiteRegistration
|
||||
// lookup for graph resolution.
|
||||
using TfLiteDelegatePtrVector =
|
||||
std::vector<std::unique_ptr<TfLiteDelegate, void (*)(TfLiteDelegate*)>>;
|
||||
virtual TfLiteDelegatePtrVector GetDelegates(int num_threads) const {
|
||||
return TfLiteDelegatePtrVector();
|
||||
}
|
||||
|
||||
virtual ~OpResolver() {}
|
||||
};
|
||||
|
||||
|
||||
@ -17,6 +17,8 @@ limitations under the License.
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include "tensorflow/lite/c/common.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
TfLiteStatus ResetVariableTensor(TfLiteTensor* tensor) {
|
||||
|
||||
@ -55,9 +55,12 @@ inline void GetActivationMinMax(FusedActivationFunctionType ac,
|
||||
}
|
||||
}
|
||||
|
||||
inline float ActivationFunctionWithMinMax(float x, float output_activation_min,
|
||||
float output_activation_max) {
|
||||
return std::min(std::max(x, output_activation_min), output_activation_max);
|
||||
template <typename T>
|
||||
inline T ActivationFunctionWithMinMax(T x, T output_activation_min,
|
||||
T output_activation_max) {
|
||||
using std::max;
|
||||
using std::min;
|
||||
return min(max(x, output_activation_min), output_activation_max);
|
||||
}
|
||||
|
||||
// Legacy function, left for compatibility only.
|
||||
@ -135,23 +138,24 @@ inline void BiasAndClamp(float clamp_min, float clamp_max, int bias_size,
|
||||
#endif
|
||||
}
|
||||
|
||||
inline int32 MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
int32 x, int32 quantized_multiplier, int left_shift) {
|
||||
inline int32_t MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
int32_t x, int32_t quantized_multiplier, int left_shift) {
|
||||
using gemmlowp::RoundingDivideByPOT;
|
||||
using gemmlowp::SaturatingRoundingDoublingHighMul;
|
||||
return RoundingDivideByPOT(
|
||||
SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift);
|
||||
}
|
||||
|
||||
inline int32 MultiplyByQuantizedMultiplierGreaterThanOne(
|
||||
int32 x, int32 quantized_multiplier, int left_shift) {
|
||||
inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne(
|
||||
int32_t x, int32_t quantized_multiplier, int left_shift) {
|
||||
using gemmlowp::SaturatingRoundingDoublingHighMul;
|
||||
return SaturatingRoundingDoublingHighMul(x * (1 << left_shift),
|
||||
quantized_multiplier);
|
||||
}
|
||||
|
||||
inline int32 MultiplyByQuantizedMultiplier(int32 x, int32 quantized_multiplier,
|
||||
int shift) {
|
||||
inline int32_t MultiplyByQuantizedMultiplier(int32_t x,
|
||||
int32_t quantized_multiplier,
|
||||
int shift) {
|
||||
using gemmlowp::RoundingDivideByPOT;
|
||||
using gemmlowp::SaturatingRoundingDoublingHighMul;
|
||||
int left_shift = shift > 0 ? shift : 0;
|
||||
@ -161,16 +165,16 @@ inline int32 MultiplyByQuantizedMultiplier(int32 x, int32 quantized_multiplier,
|
||||
right_shift);
|
||||
}
|
||||
|
||||
inline int32 MultiplyByQuantizedMultiplier(int64_t x,
|
||||
int32 quantized_multiplier,
|
||||
int shift) {
|
||||
inline int32_t MultiplyByQuantizedMultiplier(int64_t x,
|
||||
int32_t quantized_multiplier,
|
||||
int shift) {
|
||||
// Inputs:
|
||||
// - quantized_multiplier has fixed point at bit 31
|
||||
// - shift is -31 to +7 (negative for right shift)
|
||||
//
|
||||
// Assumptions: The following input ranges are assumed
|
||||
// - quantize_scale>=0 (the usual range is (1<<30) to (1>>31)-1)
|
||||
// - scaling is chosen so final scaled result fits in int32
|
||||
// - scaling is chosen so final scaled result fits in int32_t
|
||||
// - input x is in the range -(1<<47) <= x < (1<<47)
|
||||
assert(quantized_multiplier >= 0);
|
||||
assert(shift >= -31 && shift < 8);
|
||||
@ -215,9 +219,9 @@ inline int CountLeadingSignBits(T integer_input) {
|
||||
using U = typename std::make_unsigned<T>::type;
|
||||
return integer_input >= 0
|
||||
? CountLeadingZeros(static_cast<U>(integer_input)) - 1
|
||||
: integer_input != std::numeric_limits<T>::min()
|
||||
? CountLeadingZeros(2 * static_cast<U>(-integer_input) - 1)
|
||||
: 0;
|
||||
: integer_input != std::numeric_limits<T>::min()
|
||||
? CountLeadingZeros(2 * static_cast<U>(-integer_input) - 1)
|
||||
: 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -259,7 +263,7 @@ inline void gen_lut(const std::function<double(double)>& func, double min,
|
||||
std::min(std::max(TfLiteRound(func(max) * 32768.0), -32768.0), 32767.0);
|
||||
}
|
||||
|
||||
// int16 func table lookup, e.g., lookup exp() and 1/(1+x) used in softmax
|
||||
// int16_t func table lookup, e.g., lookup exp() and 1/(1+x) used in softmax
|
||||
inline int16_t generic_int16_table_lookup(int16_t value, const int16_t* lut) {
|
||||
// 512 base value, lut[513] only for calculate slope
|
||||
uint16_t index = static_cast<uint16_t>(256 + (value >> 7));
|
||||
@ -410,6 +414,23 @@ SaturatingRoundingMultiplyByPOTParam(
|
||||
SaturatingRoundingMultiplyByPOTParam(a.raw(), exponent));
|
||||
}
|
||||
|
||||
// Convert int32_t multiplier to int16_t with rounding.
|
||||
inline void DownScaleInt32ToInt16Multiplier(int32_t multiplier_int32_t,
|
||||
int16_t* multiplier_int16_t) {
|
||||
TFLITE_DCHECK_GE(multiplier_int32_t, 0);
|
||||
static constexpr int32_t kRoundingOffset = 1 << 15;
|
||||
if (multiplier_int32_t >=
|
||||
std::numeric_limits<int32_t>::max() - kRoundingOffset) {
|
||||
*multiplier_int16_t = std::numeric_limits<int16_t>::max();
|
||||
return;
|
||||
}
|
||||
const int32_t result = (multiplier_int32_t + kRoundingOffset) >> 16;
|
||||
TFLITE_DCHECK_LE(result << 16, multiplier_int32_t + kRoundingOffset);
|
||||
TFLITE_DCHECK_GT(result << 16, multiplier_int32_t - kRoundingOffset);
|
||||
*multiplier_int16_t = result;
|
||||
TFLITE_DCHECK_EQ(*multiplier_int16_t, result);
|
||||
}
|
||||
|
||||
// Minimum output bits to accommodate log of maximum input range. It actually
|
||||
// does not matter if one considers, say, [-64,64] or [-64,64).
|
||||
//
|
||||
@ -418,15 +439,13 @@ SaturatingRoundingMultiplyByPOTParam(
|
||||
// ceil(log(abs( log(2.^(0:127))+1 ))/log(2)); ...
|
||||
// ceil(log(abs( log(2.^(0:127))+1 ))/log(2))]
|
||||
constexpr int min_log_x_output_bits(int input_bits) {
|
||||
return input_bits > 90
|
||||
? 7
|
||||
: input_bits > 44
|
||||
? 6
|
||||
: input_bits > 21
|
||||
? 5
|
||||
: input_bits > 10
|
||||
? 4
|
||||
: input_bits > 4 ? 3 : input_bits > 1 ? 2 : 1;
|
||||
return input_bits > 90 ? 7
|
||||
: input_bits > 44 ? 6
|
||||
: input_bits > 21 ? 5
|
||||
: input_bits > 10 ? 4
|
||||
: input_bits > 4 ? 3
|
||||
: input_bits > 1 ? 2
|
||||
: 1;
|
||||
}
|
||||
|
||||
// Although currently the name of this function says that it cannot handle
|
||||
@ -434,17 +453,17 @@ constexpr int min_log_x_output_bits(int input_bits) {
|
||||
// x_max is the largest representable input. In other words, the output range
|
||||
// is symmetric.
|
||||
template <int OutputIntegerBits, int InputIntegerBits>
|
||||
inline gemmlowp::FixedPoint<int32, OutputIntegerBits>
|
||||
inline gemmlowp::FixedPoint<int32_t, OutputIntegerBits>
|
||||
log_x_for_x_greater_than_or_equal_to_1_impl(
|
||||
gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) {
|
||||
// assert(__builtin_clz(0u) >= std::numeric_limits<uint32>::digits - 1);
|
||||
// assert(__builtin_clz(0u) <= std::numeric_limits<uint32>::digits);
|
||||
using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
|
||||
gemmlowp::FixedPoint<int32_t, InputIntegerBits> input_val) {
|
||||
// assert(__builtin_clz(0u) >= std::numeric_limits<uint32_t>::digits - 1);
|
||||
// assert(__builtin_clz(0u) <= std::numeric_limits<uint32_t>::digits);
|
||||
using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>;
|
||||
// The reason for accumulating the result with an extra bit of headroom is
|
||||
// that z_pow_2_adj * log_2 might be saturated, and adding num_scaled *
|
||||
// recip_denom will otherwise introduce an error.
|
||||
static constexpr int kAccumIntegerBits = OutputIntegerBits + 1;
|
||||
using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumIntegerBits>;
|
||||
using FixedPointAccum = gemmlowp::FixedPoint<int32_t, kAccumIntegerBits>;
|
||||
|
||||
const FixedPoint0 log_2 = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
|
||||
FixedPoint0, 1488522236, std::log(2.0));
|
||||
@ -472,10 +491,10 @@ log_x_for_x_greater_than_or_equal_to_1_impl(
|
||||
// required shift "ourselves" instead of using, say, Rescale.
|
||||
FixedPoint0 z_a = FixedPoint0::FromRaw(input_val.raw());
|
||||
// z_a_pow_2 = input_integer_bits - z_a_headroom;
|
||||
int z_a_headroom_plus_1 = CountLeadingZeros(static_cast<uint32>(z_a.raw()));
|
||||
int z_a_headroom_plus_1 = CountLeadingZeros(static_cast<uint32_t>(z_a.raw()));
|
||||
FixedPoint0 r_a_tmp =
|
||||
SaturatingRoundingMultiplyByPOTParam(z_a, (z_a_headroom_plus_1 - 1));
|
||||
const int32 r_a_raw =
|
||||
const int32_t r_a_raw =
|
||||
SaturatingRoundingMultiplyByPOTParam((r_a_tmp * sqrt_half).raw(), 1);
|
||||
// z_pow_2_adj = max(z_pow_2_a - 0.75, z_pow_2_b - 0.25);
|
||||
// z_pow_2_adj = max(InputIntegerBits - z_a_headroom_plus_1 + 0.25,
|
||||
@ -487,8 +506,8 @@ log_x_for_x_greater_than_or_equal_to_1_impl(
|
||||
|
||||
// z_b is treated like z_a, but premultiplying by sqrt(0.5).
|
||||
FixedPoint0 z_b = z_a * sqrt_half;
|
||||
int z_b_headroom = CountLeadingZeros(static_cast<uint32>(z_b.raw())) - 1;
|
||||
const int32 r_b_raw =
|
||||
int z_b_headroom = CountLeadingZeros(static_cast<uint32_t>(z_b.raw())) - 1;
|
||||
const int32_t r_b_raw =
|
||||
SaturatingRoundingMultiplyByPOTParam(z_a.raw(), z_b_headroom);
|
||||
const FixedPointAccum z_b_pow_2_adj = SaturatingSub(
|
||||
FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam(
|
||||
@ -516,9 +535,9 @@ log_x_for_x_greater_than_or_equal_to_1_impl(
|
||||
}
|
||||
|
||||
template <int OutputIntegerBits, int InputIntegerBits>
|
||||
inline gemmlowp::FixedPoint<int32, OutputIntegerBits>
|
||||
inline gemmlowp::FixedPoint<int32_t, OutputIntegerBits>
|
||||
log_x_for_x_greater_than_or_equal_to_1(
|
||||
gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) {
|
||||
gemmlowp::FixedPoint<int32_t, InputIntegerBits> input_val) {
|
||||
static_assert(
|
||||
OutputIntegerBits >= min_log_x_output_bits(InputIntegerBits),
|
||||
"Output integer bits must be sufficient to accommodate logs of inputs.");
|
||||
@ -527,25 +546,25 @@ log_x_for_x_greater_than_or_equal_to_1(
|
||||
input_val);
|
||||
}
|
||||
|
||||
inline int32 GetReciprocal(int32 x, int x_integer_digits,
|
||||
int* num_bits_over_unit) {
|
||||
int headroom_plus_one = CountLeadingZeros(static_cast<uint32>(x));
|
||||
inline int32_t GetReciprocal(int32_t x, int x_integer_digits,
|
||||
int* num_bits_over_unit) {
|
||||
int headroom_plus_one = CountLeadingZeros(static_cast<uint32_t>(x));
|
||||
// This is the number of bits to the left of the binary point above 1.0.
|
||||
// Consider x=1.25. In that case shifted_scale=0.8 and
|
||||
// no later adjustment will be needed.
|
||||
*num_bits_over_unit = x_integer_digits - headroom_plus_one;
|
||||
const int32 shifted_sum_minus_one =
|
||||
static_cast<int32>((static_cast<uint32>(x) << headroom_plus_one) -
|
||||
(static_cast<uint32>(1) << 31));
|
||||
const int32_t shifted_sum_minus_one =
|
||||
static_cast<int32_t>((static_cast<uint32_t>(x) << headroom_plus_one) -
|
||||
(static_cast<uint32_t>(1) << 31));
|
||||
|
||||
gemmlowp::FixedPoint<int32, 0> shifted_scale =
|
||||
gemmlowp::FixedPoint<int32_t, 0> shifted_scale =
|
||||
gemmlowp::one_over_one_plus_x_for_x_in_0_1(
|
||||
gemmlowp::FixedPoint<int32, 0>::FromRaw(shifted_sum_minus_one));
|
||||
gemmlowp::FixedPoint<int32_t, 0>::FromRaw(shifted_sum_minus_one));
|
||||
return shifted_scale.raw();
|
||||
}
|
||||
|
||||
inline void GetInvSqrtQuantizedMultiplierExp(int32 input, int reverse_shift,
|
||||
int32* output_inv_sqrt,
|
||||
inline void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift,
|
||||
int32_t* output_inv_sqrt,
|
||||
int* output_shift) {
|
||||
TFLITE_DCHECK_GE(input, 0);
|
||||
if (input <= 1) {
|
||||
@ -565,7 +584,7 @@ inline void GetInvSqrtQuantizedMultiplierExp(int32 input, int reverse_shift,
|
||||
++*output_shift;
|
||||
}
|
||||
const unsigned max_left_shift_bits =
|
||||
CountLeadingZeros(static_cast<uint32>(input)) - 1;
|
||||
CountLeadingZeros(static_cast<uint32_t>(input)) - 1;
|
||||
const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2;
|
||||
const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1;
|
||||
*output_shift -= left_shift_bit_pairs;
|
||||
@ -577,8 +596,8 @@ inline void GetInvSqrtQuantizedMultiplierExp(int32 input, int reverse_shift,
|
||||
using gemmlowp::SaturatingRoundingMultiplyByPOT;
|
||||
// Using 3 integer bits gives us enough room for the internal arithmetic in
|
||||
// this Newton-Raphson iteration.
|
||||
using F3 = FixedPoint<int32, 3>;
|
||||
using F0 = FixedPoint<int32, 0>;
|
||||
using F3 = FixedPoint<int32_t, 3>;
|
||||
using F0 = FixedPoint<int32_t, 0>;
|
||||
const F3 fixedpoint_input = F3::FromRaw(input >> 1);
|
||||
const F3 fixedpoint_half_input =
|
||||
SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
|
||||
|
||||
@ -76,13 +76,15 @@ limitations under the License.
|
||||
#define TFLITE_CHECK_LT(x, y) ((x) < (y)) ? (void)0 : TFLITE_ABORT
|
||||
#endif
|
||||
|
||||
// TODO(ahentz): Clean up.
|
||||
#ifndef TF_LITE_STATIC_MEMORY
|
||||
// TODO(b/162019032): Consider removing these type-aliases.
|
||||
using int8 = std::int8_t;
|
||||
using uint8 = std::uint8_t;
|
||||
using int16 = std::int16_t;
|
||||
using uint16 = std::uint16_t;
|
||||
using int32 = std::int32_t;
|
||||
using uint32 = std::uint32_t;
|
||||
#endif // !defined(TF_LITE_STATIC_MEMORY)
|
||||
|
||||
// TFLITE_DEPRECATED()
|
||||
//
|
||||
|
||||
@ -19,8 +19,9 @@ limitations under the License.
|
||||
|
||||
namespace tflite {
|
||||
|
||||
#if defined(TF_LITE_USE_GLOBAL_CMATH_FUNCTIONS) || \
|
||||
(defined(__ANDROID__) && !defined(__NDK_MAJOR__)) || defined(ARDUINO)
|
||||
#if defined(TF_LITE_USE_GLOBAL_CMATH_FUNCTIONS) || \
|
||||
(defined(__ANDROID__) && !defined(__NDK_MAJOR__)) || defined(ARDUINO) || \
|
||||
defined(__ZEPHYR__)
|
||||
#define TF_LITE_GLOBAL_STD_PREFIX
|
||||
#else
|
||||
#define TF_LITE_GLOBAL_STD_PREFIX std
|
||||
|
||||
@ -342,13 +342,13 @@ void NudgeQuantizationRange(const float min, const float max,
|
||||
const float quant_max_float = static_cast<float>(quant_max);
|
||||
*nudged_scale = (max - min) / (quant_max_float - quant_min_float);
|
||||
const float zero_point_from_min = quant_min_float - min / *nudged_scale;
|
||||
uint16 nudged_zero_point;
|
||||
uint16_t nudged_zero_point;
|
||||
if (zero_point_from_min < quant_min_float) {
|
||||
nudged_zero_point = static_cast<uint16>(quant_min);
|
||||
nudged_zero_point = static_cast<uint16_t>(quant_min);
|
||||
} else if (zero_point_from_min > quant_max_float) {
|
||||
nudged_zero_point = static_cast<uint16>(quant_max);
|
||||
nudged_zero_point = static_cast<uint16_t>(quant_max);
|
||||
} else {
|
||||
nudged_zero_point = static_cast<uint16>(TfLiteRound(zero_point_from_min));
|
||||
nudged_zero_point = static_cast<uint16_t>(TfLiteRound(zero_point_from_min));
|
||||
}
|
||||
*nudged_min = (quant_min_float - nudged_zero_point) * (*nudged_scale);
|
||||
*nudged_max = (quant_max_float - nudged_zero_point) * (*nudged_scale);
|
||||
|
||||
@ -51,34 +51,39 @@ inline void Add(const ArithmeticParams& params,
|
||||
|
||||
// Element-wise add that can often be used for inner loop of broadcast add as
|
||||
// well as the non-broadcast add.
|
||||
|
||||
// This function is used for 8-bit as well as for 16-bit, but the accumulator
|
||||
// is 32-bit for both cases. The overflow does not happen due to the
|
||||
// choice of the shift (20 or 15, accordingly - see add.cc for more comments).
|
||||
template <typename T>
|
||||
inline void AddElementwise(int size, const ArithmeticParams& params,
|
||||
const uint8* input1_data, const uint8* input2_data,
|
||||
uint8* output_data) {
|
||||
TFLITE_DCHECK_GT(params.input1_offset, -256);
|
||||
TFLITE_DCHECK_GT(params.input2_offset, -256);
|
||||
TFLITE_DCHECK_LT(params.input1_offset, 256);
|
||||
TFLITE_DCHECK_LT(params.input2_offset, 256);
|
||||
const T* input1_data, const T* input2_data,
|
||||
T* output_data) {
|
||||
TFLITE_DCHECK_GT(params.input1_offset, -std::numeric_limits<T>::max());
|
||||
TFLITE_DCHECK_GT(params.input2_offset, -std::numeric_limits<T>::max());
|
||||
TFLITE_DCHECK_LT(params.input1_offset, std::numeric_limits<T>::max());
|
||||
TFLITE_DCHECK_LT(params.input2_offset, std::numeric_limits<T>::max());
|
||||
|
||||
for (int i = 0; i < size; ++i) {
|
||||
const int32 input1_val = params.input1_offset + input1_data[i];
|
||||
const int32 input2_val = params.input2_offset + input2_data[i];
|
||||
const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
|
||||
const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
|
||||
const int32 scaled_input1_val =
|
||||
const int32_t input1_val = params.input1_offset + input1_data[i];
|
||||
const int32_t input2_val = params.input2_offset + input2_data[i];
|
||||
const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
|
||||
const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
|
||||
const int32_t scaled_input1_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input1_val, params.input1_multiplier, params.input1_shift);
|
||||
const int32 scaled_input2_val =
|
||||
const int32_t scaled_input2_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input2_val, params.input2_multiplier, params.input2_shift);
|
||||
const int32 raw_sum = scaled_input1_val + scaled_input2_val;
|
||||
const int32 raw_output =
|
||||
const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
|
||||
const int32_t raw_output =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
raw_sum, params.output_multiplier, params.output_shift) +
|
||||
params.output_offset;
|
||||
const int32 clamped_output =
|
||||
const int32_t clamped_output =
|
||||
std::min(params.quantized_activation_max,
|
||||
std::max(params.quantized_activation_min, raw_output));
|
||||
output_data[i] = static_cast<uint8>(clamped_output);
|
||||
output_data[i] = static_cast<T>(clamped_output);
|
||||
}
|
||||
}
|
||||
|
||||
@ -86,40 +91,40 @@ inline void AddElementwise(int size, const ArithmeticParams& params,
// broadcast add, so that, for example, scalar-broadcast with batch will still
// be fast.
inline void AddScalarBroadcast(int size, const ArithmeticParams& params,
uint8 input1_data, const uint8* input2_data,
uint8* output_data) {
uint8_t input1_data, const uint8_t* input2_data,
uint8_t* output_data) {
TFLITE_DCHECK_GT(params.input1_offset, -256);
TFLITE_DCHECK_GT(params.input2_offset, -256);
TFLITE_DCHECK_LT(params.input1_offset, 256);
TFLITE_DCHECK_LT(params.input2_offset, 256);

const int32 input1_val = params.input1_offset + input1_data;
const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
const int32 scaled_input1_val =
const int32_t input1_val = params.input1_offset + input1_data;
const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
const int32_t scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input1_val, params.input1_multiplier, params.input1_shift);
for (int i = 0; i < size; ++i) {
const int32 input2_val = params.input2_offset + input2_data[i];
const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
const int32 scaled_input2_val =
const int32_t input2_val = params.input2_offset + input2_data[i];
const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
const int32_t scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input2_val, params.input2_multiplier, params.input2_shift);
const int32 raw_sum = scaled_input1_val + scaled_input2_val;
const int32 raw_output =
const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
const int32_t raw_output =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
raw_sum, params.output_multiplier, params.output_shift) +
params.output_offset;
const int32 clamped_output =
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, raw_output));
output_data[i] = static_cast<uint8>(clamped_output);
output_data[i] = static_cast<uint8_t>(clamped_output);
}
}

inline void Add(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const uint8* input1_data,
const RuntimeShape& input2_shape, const uint8* input2_data,
const RuntimeShape& output_shape, uint8* output_data) {
const RuntimeShape& input1_shape, const uint8_t* input1_data,
const RuntimeShape& input2_shape, const uint8_t* input2_data,
const RuntimeShape& output_shape, uint8_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
const int flat_size =
@ -132,24 +137,53 @@ inline void Add(const ArithmeticParams& params,
AddElementwise(flat_size, params, input1_data, input2_data, output_data);
}

inline void AddGeneralParamScale(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const int16_t* input1_data,
const RuntimeShape& input2_shape,
const int16_t* input2_data,
const RuntimeShape& output_shape,
int16_t* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);

int max_value = std::numeric_limits<int16_t>::max();

TFLITE_DCHECK_GT(params.input1_offset, -max_value);
TFLITE_DCHECK_GT(params.input2_offset, -max_value);
TFLITE_DCHECK_LT(params.input1_offset, max_value);
TFLITE_DCHECK_LT(params.input2_offset, max_value);
AddElementwise(flat_size, params, input1_data, input2_data, output_data);
}

inline void Add(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const int16* input1_data,
const RuntimeShape& input2_shape, const int16* input2_data,
const RuntimeShape& output_shape, int16* output_data) {
const RuntimeShape& input1_shape, const int16_t* input1_data,
const RuntimeShape& input2_shape, const int16_t* input2_data,
const RuntimeShape& output_shape, int16_t* output_data,
bool pot_scale = true) {
if (!pot_scale) {
AddGeneralParamScale(params, input1_shape, input1_data, input2_shape,
input2_data, output_shape, output_data);
return;
}

TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);

const int input1_shift = params.input1_shift;
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
const int16 output_activation_min = params.quantized_activation_min;
const int16 output_activation_max = params.quantized_activation_max;
const int16_t output_activation_min = params.quantized_activation_min;
const int16_t output_activation_max = params.quantized_activation_max;

TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0);
TFLITE_DCHECK_LE(input1_shift, 0);
TFLITE_DCHECK_LE(params.input2_shift, 0);
const int16* not_shift_input = input1_shift == 0 ? input1_data : input2_data;
const int16* shift_input = input1_shift == 0 ? input2_data : input1_data;
const int16_t* not_shift_input =
input1_shift == 0 ? input1_data : input2_data;
const int16_t* shift_input = input1_shift == 0 ? input2_data : input1_data;
const int input_right_shift =
input1_shift == 0 ? -params.input2_shift : -input1_shift;

@ -161,8 +195,8 @@ inline void Add(const ArithmeticParams& params,
F0 scaled_input = F0::FromRaw(
gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
F0 result = gemmlowp::SaturatingAdd(scaled_input, input_ready_scaled);
const int16 raw_output = result.raw();
const int16 clamped_output = std::min(
const int16_t raw_output = result.raw();
const int16_t clamped_output = std::min(
output_activation_max, std::max(output_activation_min, raw_output));
output_data[i] = clamped_output;
}
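The int16_t path above relies on pure power-of-two rescaling: one input keeps shift 0 while the other is divided by 2^n with rounding before the saturating fixed-point add. A minimal sketch of that rounding divide (simplified from gemmlowp::RoundingDivideByPOT, which rounds half away from zero):

// Sketch only: divide by 2^exponent with round-half-up, as applied to the
// shifted int16_t input before gemmlowp::SaturatingAdd.
inline int16_t RoundingDivideByPOTSketch(int16_t x, int exponent /* >= 0 */) {
  const int32_t rounding = exponent > 0 ? (1 << (exponent - 1)) : 0;
  return static_cast<int16_t>((static_cast<int32_t>(x) + rounding) >> exponent);
}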
@ -218,11 +252,11 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params,

inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const int32* input1_data,
const int32_t* input1_data,
const RuntimeShape& input2_shape,
const int32* input2_data,
const int32_t* input2_data,
const RuntimeShape& output_shape,
int32* output_data) {
int32_t* output_data) {
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
@ -257,13 +291,14 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
}
}

inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const uint8* input1_data,
const RuntimeShape& input2_shape,
const uint8* input2_data,
const RuntimeShape& output_shape,
uint8* output_data) {
// This function is used for 8-bit as well as for 16-bit, but the accumulator
// is 32-bit for both cases. The overflow does not happen due to the
// choice of the shift (20 or 15, accordingly - see add.cc for more comments).
template <typename T>
inline void BroadcastAdd4DSlow(
const ArithmeticParams& params, const RuntimeShape& input1_shape,
const T* input1_data, const RuntimeShape& input2_shape,
const T* input2_data, const RuntimeShape& output_shape, T* output_data) {
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
@ -286,34 +321,34 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
const int32 input1_val =
const int32_t input1_val =
params.input1_offset +
input1_data[SubscriptToIndex(desc1, b, y, x, c)];
const int32 input2_val =
const int32_t input2_val =
params.input2_offset +
input2_data[SubscriptToIndex(desc2, b, y, x, c)];
const int32 shifted_input1_val =
const int32_t shifted_input1_val =
input1_val * (1 << params.left_shift);
const int32 shifted_input2_val =
const int32_t shifted_input2_val =
input2_val * (1 << params.left_shift);
const int32 scaled_input1_val =
const int32_t scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input1_val, params.input1_multiplier,
params.input1_shift);
const int32 scaled_input2_val =
const int32_t scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input2_val, params.input2_multiplier,
params.input2_shift);
const int32 raw_sum = scaled_input1_val + scaled_input2_val;
const int32 raw_output =
const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
const int32_t raw_output =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
raw_sum, params.output_multiplier, params.output_shift) +
params.output_offset;
const int32 clamped_output =
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, raw_output));
output_data[Offset(extended_output_shape, b, y, x, c)] =
static_cast<uint8>(clamped_output);
static_cast<T>(clamped_output);
}
}
}
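For readers unfamiliar with the NdArrayDesc/SubscriptToIndex machinery used by BroadcastAdd4DSlow: broadcasting is realized by giving any axis of extent 1 a stride of 0, so the same element is reused along that axis. A simplified stand-in (hypothetical helper, not the TFLite code):

// Hypothetical illustration of broadcast indexing for a dense 4-D tensor:
// axes of size 1 get stride 0 and are therefore broadcast.
inline int BroadcastIndexSketch(const int dims[4], int b, int y, int x, int c) {
  const int s3 = dims[3] == 1 ? 0 : 1;
  const int s2 = dims[2] == 1 ? 0 : dims[3];
  const int s1 = dims[1] == 1 ? 0 : dims[2] * dims[3];
  const int s0 = dims[0] == 1 ? 0 : dims[1] * dims[2] * dims[3];
  return b * s0 + y * s1 + x * s2 + c * s3;
}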
@ -322,11 +357,11 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params,

inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
const RuntimeShape& unswitched_input1_shape,
const uint8* unswitched_input1_data,
const uint8_t* unswitched_input1_data,
const RuntimeShape& unswitched_input2_shape,
const uint8* unswitched_input2_data,
const uint8_t* unswitched_input2_data,
const RuntimeShape& output_shape,
uint8* output_data) {
uint8_t* output_data) {
ArithmeticParams switched_params = unswitched_params;
switched_params.input1_offset = unswitched_params.input2_offset;
switched_params.input1_multiplier = unswitched_params.input2_multiplier;
@ -341,18 +376,18 @@ inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,

const ArithmeticParams& params =
use_unswitched ? unswitched_params : switched_params;
const uint8* input1_data =
const uint8_t* input1_data =
use_unswitched ? unswitched_input1_data : unswitched_input2_data;
const uint8* input2_data =
const uint8_t* input2_data =
use_unswitched ? unswitched_input2_data : unswitched_input1_data;

// Fivefold nested loops. The second input resets its position for each
// iteration of the second loop. The first input resets its position at the
// beginning of the fourth loop. The innermost loop is an elementwise add of
// sections of the arrays.
uint8* output_data_ptr = output_data;
const uint8* input1_data_ptr = input1_data;
const uint8* input2_data_reset = input2_data;
uint8_t* output_data_ptr = output_data;
const uint8_t* input1_data_ptr = input1_data;
const uint8_t* input2_data_reset = input2_data;
// In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared
// between input shapes. y3 for input 1 is always broadcast, and so the
// dimension there is 1, whereas optionally y1 might be broadcast for input 2.
@ -368,7 +403,7 @@ inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
// General fivefold pattern, with y4 > 1 so there is a non-broadcast inner
// dimension.
for (int i0 = 0; i0 < y0; ++i0) {
const uint8* input2_data_ptr;
const uint8_t* input2_data_ptr;
for (int i1 = 0; i1 < y1; ++i1) {
input2_data_ptr = input2_data_reset;
for (int i2 = 0; i2 < y2; ++i2) {
@ -397,7 +432,7 @@ inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
// for y4 == 1 and the loop over y3 is contained within the
// AddScalarBroadcast function.
for (int i0 = 0; i0 < y0; ++i0) {
const uint8* input2_data_ptr;
const uint8_t* input2_data_ptr;
for (int i1 = 0; i1 < y1; ++i1) {
input2_data_ptr = input2_data_reset;
for (int i2 = 0; i2 < y2; ++i2) {

@ -18,7 +18,6 @@ limitations under the License.
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/types.h"
#include "tensorflow/lite/string_util.h"

namespace tflite {

@ -51,18 +50,6 @@ inline bool LessEqualFn(T lhs, T rhs) {
return lhs <= rhs;
}

inline bool StringRefEqualFn(const StringRef& lhs, const StringRef& rhs) {
if (lhs.len != rhs.len) return false;
for (int i = 0; i < lhs.len; ++i) {
if (lhs.str[i] != rhs.str[i]) return false;
}
return true;
}

inline bool StringRefNotEqualFn(const StringRef& lhs, const StringRef& rhs) {
return !StringRefEqualFn(lhs, rhs);
}

template <typename T>
using ComparisonFn = bool (*)(T, T);

@ -78,22 +65,6 @@ inline void ComparisonImpl(
}
}

inline void ComparisonStringImpl(bool (*F)(const StringRef&, const StringRef&),
const RuntimeShape& input1_shape,
const TfLiteTensor* input1,
const RuntimeShape& input2_shape,
const TfLiteTensor* input2,
const RuntimeShape& output_shape,
bool* output_data) {
const int64_t flatsize =
MatchingFlatSize(input1_shape, input2_shape, output_shape);
for (int64_t i = 0; i < flatsize; ++i) {
const auto lhs = GetString(input1, i);
const auto rhs = GetString(input2, i);
output_data[i] = F(lhs, rhs);
}
}

template <ComparisonFn<float> F>
inline void Comparison(const ComparisonParams& op_params,
const RuntimeShape& input1_shape,
@ -105,30 +76,30 @@ inline void Comparison(const ComparisonParams& op_params,
input2_data, output_shape, output_data);
}

template <typename T, ComparisonFn<int32> F>
template <typename T, ComparisonFn<int32_t> F>
inline void ComparisonWithScaling(
const ComparisonParams& op_params, const RuntimeShape& input1_shape,
const T* input1_data, const RuntimeShape& input2_shape,
const T* input2_data, const RuntimeShape& output_shape, bool* output_data) {
int left_shift = op_params.left_shift;
int32 input1_offset = op_params.input1_offset;
int32 input1_multiplier = op_params.input1_multiplier;
int32_t input1_offset = op_params.input1_offset;
int32_t input1_multiplier = op_params.input1_multiplier;
int input1_shift = op_params.input1_shift;
int32 input2_offset = op_params.input2_offset;
int32 input2_multiplier = op_params.input2_multiplier;
int32_t input2_offset = op_params.input2_offset;
int32_t input2_multiplier = op_params.input2_multiplier;
int input2_shift = op_params.input2_shift;

const int64_t flatsize =
MatchingFlatSize(input1_shape, input2_shape, output_shape);
for (int64_t i = 0; i < flatsize; ++i) {
const int32 input1_val = input1_offset + input1_data[i];
const int32 input2_val = input2_offset + input2_data[i];
const int32 shifted_input1_val = input1_val * (1 << left_shift);
const int32 shifted_input2_val = input2_val * (1 << left_shift);
const int32 scaled_input1_val =
const int32_t input1_val = input1_offset + input1_data[i];
const int32_t input2_val = input2_offset + input2_data[i];
const int32_t shifted_input1_val = input1_val * (1 << left_shift);
const int32_t shifted_input2_val = input2_val * (1 << left_shift);
const int32_t scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input1_val, input1_multiplier, input1_shift);
const int32 scaled_input2_val =
const int32_t scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input2_val, input2_multiplier, input2_shift);
output_data[i] = F(scaled_input1_val, scaled_input2_val);
@ -180,31 +151,6 @@ inline void BroadcastComparison4DSlowImpl(
}
}

inline void BroadcastComparison4DSlowStringImpl(
bool (*F)(const StringRef&, const StringRef&),
const RuntimeShape& unextended_input1_shape, const TfLiteTensor* input1,
const RuntimeShape& unextended_input2_shape, const TfLiteTensor* input2,
const RuntimeShape& unextended_output_shape, bool* output_data) {
const BroadcastComparison4DSlowCommon dims =
BroadcastComparison4DSlowPreprocess(unextended_input1_shape,
unextended_input2_shape,
unextended_output_shape);

for (int b = 0; b < dims.output_shape.Dims(0); ++b) {
for (int y = 0; y < dims.output_shape.Dims(1); ++y) {
for (int x = 0; x < dims.output_shape.Dims(2); ++x) {
for (int c = 0; c < dims.output_shape.Dims(3); ++c) {
const auto lhs =
GetString(input1, SubscriptToIndex(dims.desc1, b, y, x, c));
const auto rhs =
GetString(input2, SubscriptToIndex(dims.desc2, b, y, x, c));
output_data[Offset(dims.output_shape, b, y, x, c)] = F(lhs, rhs);
}
}
}
}
}

template <ComparisonFn<float> F>
inline void BroadcastComparison4DSlow(const ComparisonParams& op_params,
const RuntimeShape& input1_shape,
@ -218,7 +164,7 @@ inline void BroadcastComparison4DSlow(const ComparisonParams& op_params,
output_shape, output_data);
}

template <typename T, ComparisonFn<int32> F>
template <typename T, ComparisonFn<int32_t> F>
inline void BroadcastComparison4DSlowWithScaling(
const ComparisonParams& op_params,
const RuntimeShape& unextended_input1_shape, const T* input1_data,
@ -230,29 +176,29 @@ inline void BroadcastComparison4DSlowWithScaling(
unextended_output_shape);

int left_shift = op_params.left_shift;
int32 input1_offset = op_params.input1_offset;
int32 input1_multiplier = op_params.input1_multiplier;
int32_t input1_offset = op_params.input1_offset;
int32_t input1_multiplier = op_params.input1_multiplier;
int input1_shift = op_params.input1_shift;
int32 input2_offset = op_params.input2_offset;
int32 input2_multiplier = op_params.input2_multiplier;
int32_t input2_offset = op_params.input2_offset;
int32_t input2_multiplier = op_params.input2_multiplier;
int input2_shift = op_params.input2_shift;

for (int b = 0; b < dims.output_shape.Dims(0); ++b) {
for (int y = 0; y < dims.output_shape.Dims(1); ++y) {
for (int x = 0; x < dims.output_shape.Dims(2); ++x) {
for (int c = 0; c < dims.output_shape.Dims(3); ++c) {
const int32 input1_val =
const int32_t input1_val =
input1_offset +
input1_data[SubscriptToIndex(dims.desc1, b, y, x, c)];
const int32 input2_val =
const int32_t input2_val =
input2_offset +
input2_data[SubscriptToIndex(dims.desc2, b, y, x, c)];
const int32 shifted_input1_val = input1_val * (1 << left_shift);
const int32 shifted_input2_val = input2_val * (1 << left_shift);
const int32 scaled_input1_val =
const int32_t shifted_input1_val = input1_val * (1 << left_shift);
const int32_t shifted_input2_val = input2_val * (1 << left_shift);
const int32_t scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input1_val, input1_multiplier, input1_shift);
const int32 scaled_input2_val =
const int32_t scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input2_val, input2_multiplier, input2_shift);
output_data[Offset(dims.output_shape, b, y, x, c)] =

@ -74,14 +74,14 @@ inline void Concatenation(const ConcatenationParams& params,
// when optimizng this routine further.
inline void ConcatenationWithScaling(const ConcatenationParams& params,
const RuntimeShape* const* input_shapes,
const uint8* const* input_data,
const uint8_t* const* input_data,
const RuntimeShape& output_shape,
uint8* output_data) {
uint8_t* output_data) {
int axis = params.axis;
const int32* input_zeropoint = params.input_zeropoint;
const int32_t* input_zeropoint = params.input_zeropoint;
const float* input_scale = params.input_scale;
int inputs_count = params.inputs_count;
const int32 output_zeropoint = params.output_zeropoint;
const int32_t output_zeropoint = params.output_zeropoint;
const float output_scale = params.output_scale;

const int concat_dimensions = output_shape.DimensionsCount();
@ -110,11 +110,11 @@ inline void ConcatenationWithScaling(const ConcatenationParams& params,
}

const float inverse_output_scale = 1.f / output_scale;
uint8* output_ptr = output_data;
uint8_t* output_ptr = output_data;
for (int k = 0; k < outer_size; k++) {
for (int i = 0; i < inputs_count; ++i) {
const int copy_size = input_shapes[i]->Dims(axis) * base_inner_size;
const uint8* input_ptr = input_data[i] + k * copy_size;
const uint8_t* input_ptr = input_data[i] + k * copy_size;
if (input_zeropoint[i] == output_zeropoint &&
input_scale[i] == output_scale) {
memcpy(output_ptr, input_ptr, copy_size);

@ -99,11 +99,11 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
}

inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
const uint8* input_data, const RuntimeShape& filter_shape,
const uint8* filter_data, const RuntimeShape& bias_shape,
const int32* bias_data, const RuntimeShape& output_shape,
uint8* output_data, const RuntimeShape& im2col_shape,
uint8* im2col_data, void* cpu_backend_context) {
const uint8_t* input_data, const RuntimeShape& filter_shape,
const uint8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
uint8_t* output_data, const RuntimeShape& im2col_shape,
uint8_t* im2col_data, void* cpu_backend_context) {
(void)cpu_backend_context; // only used in optimized code.
(void)im2col_data; // only used in optimized code.
(void)im2col_shape; // only used in optimized code.
@ -113,13 +113,13 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
const int dilation_height_factor = params.dilation_height_factor;
const int pad_width = params.padding_values.width;
const int pad_height = params.padding_values.height;
const int32 input_offset = params.input_offset;
const int32 filter_offset = params.weights_offset;
const int32 output_offset = params.output_offset;
const int32 output_multiplier = params.output_multiplier;
const int32_t input_offset = params.input_offset;
const int32_t filter_offset = params.weights_offset;
const int32_t output_offset = params.output_offset;
const int32_t output_multiplier = params.output_multiplier;
const int output_shift = params.output_shift;
const int32 output_activation_min = params.quantized_activation_min;
const int32 output_activation_max = params.quantized_activation_max;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);

TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
@ -143,7 +143,7 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
const int in_x_origin = (out_x * stride_width) - pad_width;
const int in_y_origin = (out_y * stride_height) - pad_height;
int32 acc = 0;
int32_t acc = 0;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
@ -154,9 +154,9 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
// use zero as a default value.
if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height)) {
int32 input_val = input_data[Offset(input_shape, batch, in_y,
in_x, in_channel)];
int32 filter_val =
int32_t input_val = input_data[Offset(
input_shape, batch, in_y, in_x, in_channel)];
int32_t filter_val =
filter_data[Offset(filter_shape, out_channel, filter_y,
filter_x, in_channel)];
acc +=
@ -174,7 +174,7 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
acc = std::max(acc, output_activation_min);
acc = std::min(acc, output_activation_max);
output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
static_cast<uint8>(acc);
static_cast<uint8_t>(acc);
}
}
}
@ -220,7 +220,7 @@ inline void HybridConvPerChannel(
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
const int in_x_origin = (out_x * stride_width) - pad_width;
const int in_y_origin = (out_y * stride_height) - pad_height;
int32 acc = 0;
int32_t acc = 0;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
@ -231,9 +231,9 @@ inline void HybridConvPerChannel(
// use zero as a default value.
if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height)) {
int32 input_val = input_data[Offset(input_shape, batch, in_y,
in_x, in_channel)];
int32 filter_val =
int32_t input_val = input_data[Offset(
input_shape, batch, in_y, in_x, in_channel)];
int32_t filter_val =
filter_data[Offset(filter_shape, out_channel, filter_y,
filter_x, in_channel)];
acc += filter_val * (input_val - input_offset[batch]);

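Worked note on the quantized Conv accumulation above (a sketch of the standard affine-quantization relation, not text from the upstream sources): each tensor stores q with real ≈ scale * (q - zero_point), and the kernels use offset = -zero_point, so acc += (filter_val + filter_offset) * (input_val + input_offset) accumulates the integer image of real_filter * real_input up to the combined scale; the final MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift) plus output_offset then folds that back into the output's own scale and zero point. For example, with an input zero_point of 128 (input_offset = -128), a stored byte q = 200 contributes input_val + input_offset = 72, i.e. real ≈ input_scale * 72.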
@ -62,21 +62,21 @@ namespace reference_ops {
|
||||
namespace depthwise_conv {
|
||||
|
||||
template <DepthwiseConvOutputRounding output_rounding>
|
||||
inline int32 DepthwiseConvRound(int32 x, int32 quantized_multiplier,
|
||||
int shift) {
|
||||
inline int32_t DepthwiseConvRound(int32_t x, int32_t quantized_multiplier,
|
||||
int shift) {
|
||||
TFLITE_DCHECK_NE(output_rounding, DepthwiseConvOutputRounding::kNone);
|
||||
return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline int32 DepthwiseConvRound<DepthwiseConvOutputRounding::kAwayFromZero>(
|
||||
int32 x, int32 quantized_multiplier, int shift) {
|
||||
inline int32_t DepthwiseConvRound<DepthwiseConvOutputRounding::kAwayFromZero>(
|
||||
int32_t x, int32_t quantized_multiplier, int shift) {
|
||||
return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline int32 DepthwiseConvRound<DepthwiseConvOutputRounding::kUpward>(
|
||||
int32 x, int32 quantized_multiplier, int shift) {
|
||||
inline int32_t DepthwiseConvRound<DepthwiseConvOutputRounding::kUpward>(
|
||||
int32_t x, int32_t quantized_multiplier, int shift) {
|
||||
using gemmlowp::SaturatingRoundingDoublingHighMul;
|
||||
const int left_shift = shift > 0 ? shift : 0;
|
||||
const int right_shift = shift > 0 ? 0 : -shift;
|
||||
@ -89,13 +89,12 @@ inline int32 DepthwiseConvRound<DepthwiseConvOutputRounding::kUpward>(
|
||||
|
||||
template <DepthwiseConvOutputRounding output_rounding>
|
||||
struct DepthwiseConvBasicKernel {
|
||||
static inline void Run(const DepthwiseParams& params,
|
||||
const RuntimeShape& input_shape,
|
||||
const uint8* input_data,
|
||||
const RuntimeShape& filter_shape,
|
||||
const uint8* filter_data,
|
||||
const RuntimeShape& bias_shape, const int32* bias_data,
|
||||
const RuntimeShape& output_shape, uint8* output_data) {
|
||||
static inline void Run(
|
||||
const DepthwiseParams& params, const RuntimeShape& input_shape,
|
||||
const uint8_t* input_data, const RuntimeShape& filter_shape,
|
||||
const uint8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32_t* bias_data, const RuntimeShape& output_shape,
|
||||
uint8_t* output_data) {
|
||||
const int stride_width = params.stride_width;
|
||||
const int stride_height = params.stride_height;
|
||||
const int dilation_width_factor = params.dilation_width_factor;
|
||||
@ -103,12 +102,12 @@ struct DepthwiseConvBasicKernel {
|
||||
const int pad_width = params.padding_values.width;
|
||||
const int pad_height = params.padding_values.height;
|
||||
const int depth_multiplier = params.depth_multiplier;
|
||||
const int32 output_activation_min = params.quantized_activation_min;
|
||||
const int32 output_activation_max = params.quantized_activation_max;
|
||||
const int32 input_offset = params.input_offset;
|
||||
const int32 filter_offset = params.weights_offset;
|
||||
const int32 output_offset = params.output_offset;
|
||||
const int32 output_multiplier = params.output_multiplier;
|
||||
const int32_t output_activation_min = params.quantized_activation_min;
|
||||
const int32_t output_activation_max = params.quantized_activation_max;
|
||||
const int32_t input_offset = params.input_offset;
|
||||
const int32_t filter_offset = params.weights_offset;
|
||||
const int32_t output_offset = params.output_offset;
|
||||
const int32_t output_multiplier = params.output_multiplier;
|
||||
const int output_shift = params.output_shift;
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
|
||||
@ -135,7 +134,7 @@ struct DepthwiseConvBasicKernel {
|
||||
const int oc = m + ic * depth_multiplier;
|
||||
const int in_x_origin = (out_x * stride_width) - pad_width;
|
||||
const int in_y_origin = (out_y * stride_height) - pad_height;
|
||||
int32 acc = 0;
|
||||
int32_t acc = 0;
|
||||
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
|
||||
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
|
||||
const int in_x =
|
||||
@ -146,9 +145,9 @@ struct DepthwiseConvBasicKernel {
|
||||
// use zero as a default value.
|
||||
if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
|
||||
(in_y < input_height)) {
|
||||
int32 input_val =
|
||||
int32_t input_val =
|
||||
input_data[Offset(input_shape, b, in_y, in_x, ic)];
|
||||
int32 filter_val = filter_data[Offset(
|
||||
int32_t filter_val = filter_data[Offset(
|
||||
filter_shape, 0, filter_y, filter_x, oc)];
|
||||
acc += (filter_val + filter_offset) *
|
||||
(input_val + input_offset);
|
||||
@ -164,7 +163,7 @@ struct DepthwiseConvBasicKernel {
|
||||
acc = std::max(acc, output_activation_min);
|
||||
acc = std::min(acc, output_activation_max);
|
||||
output_data[Offset(output_shape, b, out_y, out_x, oc)] =
|
||||
static_cast<uint8>(acc);
|
||||
static_cast<uint8_t>(acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -176,10 +175,10 @@ struct DepthwiseConvBasicKernel {
|
||||
// MultiplyByQuantizedMultiplier or DepthwiseConvRound function.
|
||||
static inline void RunPerChannel(
|
||||
const DepthwiseParams& params, const RuntimeShape& input_shape,
|
||||
const int8* input_data, const RuntimeShape& filter_shape,
|
||||
const int8* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32* bias_data, const RuntimeShape& output_shape,
|
||||
int8* output_data) {
|
||||
const int8_t* input_data, const RuntimeShape& filter_shape,
|
||||
const int8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32_t* bias_data, const RuntimeShape& output_shape,
|
||||
int8_t* output_data) {
|
||||
// Get parameters.
|
||||
// TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro.
|
||||
const int stride_width = params.stride_width;
|
||||
@ -189,12 +188,12 @@ struct DepthwiseConvBasicKernel {
|
||||
const int pad_width = params.padding_values.width;
|
||||
const int pad_height = params.padding_values.height;
|
||||
const int depth_multiplier = params.depth_multiplier;
|
||||
const int32 input_offset = params.input_offset;
|
||||
const int32 output_offset = params.output_offset;
|
||||
const int32 output_activation_min = params.quantized_activation_min;
|
||||
const int32 output_activation_max = params.quantized_activation_max;
|
||||
const int32* output_multiplier = params.output_multiplier_per_channel;
|
||||
const int32* output_shift = params.output_shift_per_channel;
|
||||
const int32_t input_offset = params.input_offset;
|
||||
const int32_t output_offset = params.output_offset;
|
||||
const int32_t output_activation_min = params.quantized_activation_min;
|
||||
const int32_t output_activation_max = params.quantized_activation_max;
|
||||
const int32_t* output_multiplier = params.output_multiplier_per_channel;
|
||||
const int32_t* output_shift = params.output_shift_per_channel;
|
||||
|
||||
// Check dimensions of the tensors.
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
@ -222,7 +221,7 @@ struct DepthwiseConvBasicKernel {
|
||||
const int output_channel = m + in_channel * depth_multiplier;
|
||||
const int in_x_origin = (out_x * stride_width) - pad_width;
|
||||
const int in_y_origin = (out_y * stride_height) - pad_height;
|
||||
int32 acc = 0;
|
||||
int32_t acc = 0;
|
||||
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
|
||||
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
|
||||
const int in_x =
|
||||
@ -234,17 +233,18 @@ struct DepthwiseConvBasicKernel {
|
||||
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
|
||||
(in_y < input_height);
|
||||
if (is_point_inside_image) {
|
||||
int32 input_val = input_data[Offset(
|
||||
int32_t input_val = input_data[Offset(
|
||||
input_shape, batch, in_y, in_x, in_channel)];
|
||||
int32 filter_val = filter_data[Offset(
|
||||
int32_t filter_val = filter_data[Offset(
|
||||
filter_shape, 0, filter_y, filter_x, output_channel)];
|
||||
// Accumulate with 32 bits accumulator.
|
||||
// In the nudging process during model quantization, we
|
||||
// force real value of 0.0 be represented by a quantized
|
||||
// value. This guarantees that the input_offset is a int8,
|
||||
// even though it is represented using int32. int32 += int8
|
||||
// * (int8 - int8) so the highest value we can get from each
|
||||
// accumulation is [-127, 127] * ([-128, 127] -
|
||||
// value. This guarantees that the input_offset is a int8_t,
|
||||
// even though it is represented using int32_t. int32_t +=
|
||||
// int8_t
|
||||
// * (int8_t - int8_t) so the highest value we can get from
|
||||
// each accumulation is [-127, 127] * ([-128, 127] -
|
||||
// [-128, 127]), which is [-32512, 32512]. log2(32512)
|
||||
// = 14.98, which means we can accumulate at least 2^16
|
||||
// multiplications without overflow. The accumulator is
|
||||
@ -279,10 +279,10 @@ struct DepthwiseConvBasicKernel {
|
||||
|
||||
inline void DepthwiseConv(
|
||||
const DepthwiseParams& params, const RuntimeShape& input_shape,
|
||||
const uint8* input_data, const RuntimeShape& filter_shape,
|
||||
const uint8* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32* bias_data, const RuntimeShape& output_shape,
|
||||
uint8* output_data) {
|
||||
const uint8_t* input_data, const RuntimeShape& filter_shape,
|
||||
const uint8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32_t* bias_data, const RuntimeShape& output_shape,
|
||||
uint8_t* output_data) {
|
||||
return depthwise_conv::DepthwiseConvBasicKernel<
|
||||
DepthwiseConvOutputRounding::kAwayFromZero>::Run(params, input_shape,
|
||||
input_data, filter_shape,
|
||||
|
||||
@ -32,12 +32,12 @@ inline void Dequantize(const tflite::DequantizationParams& op_params,
|
||||
const RuntimeShape& input_shape,
|
||||
const InputT* input_data,
|
||||
const RuntimeShape& output_shape, OutputT* output_data) {
|
||||
int32 zero_point = op_params.zero_point;
|
||||
int32_t zero_point = op_params.zero_point;
|
||||
const double scale = op_params.scale;
|
||||
const int flat_size = MatchingFlatSize(input_shape, output_shape);
|
||||
|
||||
for (int i = 0; i < flat_size; i++) {
|
||||
const int32 val = input_data[i];
|
||||
const int32_t val = input_data[i];
|
||||
const OutputT result = static_cast<OutputT>(scale * (val - zero_point));
|
||||
output_data[i] = result;
|
||||
}
|
||||
@ -52,11 +52,11 @@ inline void PerChannelDequantize(
|
||||
// Ensure flat size is same.
|
||||
MatchingFlatSize(input_shape, output_shape);
|
||||
|
||||
const int32* zero_point = op_params.zero_point;
|
||||
const int32_t* zero_point = op_params.zero_point;
|
||||
const float* scale = op_params.scale;
|
||||
const int32 quantized_dimension = op_params.quantized_dimension;
|
||||
const int32 num_dims = input_shape.DimensionsCount();
|
||||
const int32* dims_data = input_shape.DimsData();
|
||||
const int32_t quantized_dimension = op_params.quantized_dimension;
|
||||
const int32_t num_dims = input_shape.DimensionsCount();
|
||||
const int32_t* dims_data = input_shape.DimsData();
|
||||
std::vector<int> current_dim(num_dims, 0);
|
||||
|
||||
do {
|
||||
@ -64,7 +64,7 @@ inline void PerChannelDequantize(
|
||||
ReducedOutputOffset(num_dims, reinterpret_cast<const int*>(dims_data),
|
||||
current_dim.data(), 0, nullptr);
|
||||
const int channel = current_dim[quantized_dimension];
|
||||
const int32 val = input_data[offset];
|
||||
const int32_t val = input_data[offset];
|
||||
const float result =
|
||||
static_cast<float>(scale[channel] * (val - zero_point[channel]));
|
||||
output_data[offset] = result;
|
||||
|
||||
@ -61,17 +61,17 @@ inline void FullyConnected(
|
||||
|
||||
inline void FullyConnected(
|
||||
const FullyConnectedParams& params, const RuntimeShape& input_shape,
|
||||
const uint8* input_data, const RuntimeShape& filter_shape,
|
||||
const uint8* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32* bias_data, const RuntimeShape& output_shape,
|
||||
uint8* output_data) {
|
||||
const int32 input_offset = params.input_offset;
|
||||
const int32 filter_offset = params.weights_offset;
|
||||
const int32 output_offset = params.output_offset;
|
||||
const int32 output_multiplier = params.output_multiplier;
|
||||
const uint8_t* input_data, const RuntimeShape& filter_shape,
|
||||
const uint8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32_t* bias_data, const RuntimeShape& output_shape,
|
||||
uint8_t* output_data) {
|
||||
const int32_t input_offset = params.input_offset;
|
||||
const int32_t filter_offset = params.weights_offset;
|
||||
const int32_t output_offset = params.output_offset;
|
||||
const int32_t output_multiplier = params.output_multiplier;
|
||||
const int output_shift = params.output_shift;
|
||||
const int32 output_activation_min = params.quantized_activation_min;
|
||||
const int32 output_activation_max = params.quantized_activation_max;
|
||||
const int32_t output_activation_min = params.quantized_activation_min;
|
||||
const int32_t output_activation_max = params.quantized_activation_max;
|
||||
TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
|
||||
TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
|
||||
|
||||
@ -89,10 +89,10 @@ inline void FullyConnected(
|
||||
const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
|
||||
for (int b = 0; b < batches; ++b) {
|
||||
for (int out_c = 0; out_c < output_depth; ++out_c) {
|
||||
int32 acc = 0;
|
||||
int32_t acc = 0;
|
||||
for (int d = 0; d < accum_depth; ++d) {
|
||||
int32 input_val = input_data[b * accum_depth + d];
|
||||
int32 filter_val = filter_data[out_c * accum_depth + d];
|
||||
int32_t input_val = input_data[b * accum_depth + d];
|
||||
int32_t filter_val = filter_data[out_c * accum_depth + d];
|
||||
acc += (filter_val + filter_offset) * (input_val + input_offset);
|
||||
}
|
||||
if (bias_data) {
|
||||
@ -102,24 +102,24 @@ inline void FullyConnected(
|
||||
acc += output_offset;
|
||||
acc = std::max(acc, output_activation_min);
|
||||
acc = std::min(acc, output_activation_max);
|
||||
output_data[out_c + output_depth * b] = static_cast<uint8>(acc);
|
||||
output_data[out_c + output_depth * b] = static_cast<uint8_t>(acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void FullyConnected(
|
||||
const FullyConnectedParams& params, const RuntimeShape& input_shape,
|
||||
const uint8* input_data, const RuntimeShape& filter_shape,
|
||||
const uint8* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32* bias_data, const RuntimeShape& output_shape,
|
||||
int16* output_data) {
|
||||
const int32 input_offset = params.input_offset;
|
||||
const int32 filter_offset = params.weights_offset;
|
||||
const int32 output_offset = params.output_offset;
|
||||
const int32 output_multiplier = params.output_multiplier;
|
||||
const uint8_t* input_data, const RuntimeShape& filter_shape,
|
||||
const uint8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32_t* bias_data, const RuntimeShape& output_shape,
|
||||
int16_t* output_data) {
|
||||
const int32_t input_offset = params.input_offset;
|
||||
const int32_t filter_offset = params.weights_offset;
|
||||
const int32_t output_offset = params.output_offset;
|
||||
const int32_t output_multiplier = params.output_multiplier;
|
||||
const int output_shift = params.output_shift;
|
||||
const int32 output_activation_min = params.quantized_activation_min;
|
||||
const int32 output_activation_max = params.quantized_activation_max;
|
||||
const int32_t output_activation_min = params.quantized_activation_min;
|
||||
const int32_t output_activation_max = params.quantized_activation_max;
|
||||
|
||||
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
|
||||
TFLITE_DCHECK_EQ(output_offset, 0);
|
||||
@ -138,20 +138,21 @@ inline void FullyConnected(
|
||||
for (int out_c = 0; out_c < output_depth; ++out_c) {
|
||||
// Internal accumulation.
|
||||
// Initialize accumulator with the bias-value.
|
||||
int32 accum = bias_data[out_c];
|
||||
int32_t accum = bias_data[out_c];
|
||||
// Accumulation loop.
|
||||
for (int d = 0; d < accum_depth; ++d) {
|
||||
int16 input_val = input_data[b * accum_depth + d] + input_offset;
|
||||
int16 filter_val = filter_data[out_c * accum_depth + d] + filter_offset;
|
||||
int16_t input_val = input_data[b * accum_depth + d] + input_offset;
|
||||
int16_t filter_val =
|
||||
filter_data[out_c * accum_depth + d] + filter_offset;
|
||||
accum += filter_val * input_val;
|
||||
}
|
||||
// Down-scale the final int32 accumulator to the scale used by our
|
||||
// Down-scale the final int32_t accumulator to the scale used by our
|
||||
// (16-bit, typically 3 integer bits) fixed-point format. The quantized
|
||||
// multiplier and shift here have been pre-computed offline
|
||||
// (e.g. by toco).
|
||||
accum =
|
||||
MultiplyByQuantizedMultiplier(accum, output_multiplier, output_shift);
|
||||
// Saturate, cast to int16, and store to output array.
|
||||
// Saturate, cast to int16_t, and store to output array.
|
||||
accum = std::max(accum, output_activation_min - output_offset);
|
||||
accum = std::min(accum, output_activation_max - output_offset);
|
||||
accum += output_offset;
|
||||
@ -162,14 +163,14 @@ inline void FullyConnected(
|
||||
|
||||
inline void ShuffledFullyConnected(
|
||||
const FullyConnectedParams& params, const RuntimeShape& input_shape,
|
||||
const uint8* input_data, const RuntimeShape& weights_shape,
|
||||
const uint8* shuffled_weights_data, const RuntimeShape& bias_shape,
|
||||
const int32* bias_data, const RuntimeShape& output_shape,
|
||||
int16* output_data, uint8* shuffled_input_workspace_data) {
|
||||
const int32 output_multiplier = params.output_multiplier;
|
||||
const uint8_t* input_data, const RuntimeShape& weights_shape,
|
||||
const uint8_t* shuffled_weights_data, const RuntimeShape& bias_shape,
|
||||
const int32_t* bias_data, const RuntimeShape& output_shape,
|
||||
int16_t* output_data, uint8_t* shuffled_input_workspace_data) {
|
||||
const int32_t output_multiplier = params.output_multiplier;
|
||||
const int output_shift = params.output_shift;
|
||||
const int32 output_activation_min = params.quantized_activation_min;
|
||||
const int32 output_activation_max = params.quantized_activation_max;
|
||||
const int32_t output_activation_min = params.quantized_activation_min;
|
||||
const int32_t output_activation_max = params.quantized_activation_max;
|
||||
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
|
||||
|
||||
TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1);
|
||||
@ -190,7 +191,7 @@ inline void ShuffledFullyConnected(
|
||||
TFLITE_DCHECK((output_depth % 4) == 0);
|
||||
|
||||
// Shuffling and xoring of input activations into the workspace buffer
|
||||
uint8* shuffled_input_workspace_ptr = shuffled_input_workspace_data;
|
||||
uint8_t* shuffled_input_workspace_ptr = shuffled_input_workspace_data;
|
||||
if (batches == 1) {
|
||||
for (int i = 0; i < accum_depth; i++) {
|
||||
shuffled_input_workspace_data[i] = input_data[i] ^ 0x80;
|
||||
@ -198,13 +199,13 @@ inline void ShuffledFullyConnected(
|
||||
} else if (batches == 4) {
|
||||
for (int c = 0; c < accum_depth; c += 16) {
|
||||
for (int b = 0; b < 4; b++) {
|
||||
const uint8* src_data_ptr = input_data + b * accum_depth + c;
|
||||
const uint8_t* src_data_ptr = input_data + b * accum_depth + c;
|
||||
for (int j = 0; j < 16; j++) {
|
||||
uint8 src_val = *src_data_ptr++;
|
||||
uint8_t src_val = *src_data_ptr++;
|
||||
// Flip the sign bit, so that the kernel will only need to
|
||||
// reinterpret these uint8 values as int8, getting for free the
|
||||
// reinterpret these uint8_t values as int8_t, getting for free the
|
||||
// subtraction of the zero_point value 128.
|
||||
uint8 dst_val = src_val ^ 0x80;
|
||||
uint8_t dst_val = src_val ^ 0x80;
|
||||
*shuffled_input_workspace_ptr++ = dst_val;
|
||||
}
|
||||
}
|
||||
@ -216,62 +217,62 @@ inline void ShuffledFullyConnected(
|
||||
|
||||
// Actual computation
|
||||
if (batches == 1) {
|
||||
int16* output_ptr = output_data;
|
||||
int16_t* output_ptr = output_data;
|
||||
// Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
|
||||
// so that just reinterpreting them as int8 values is equivalent to
|
||||
// so that just reinterpreting them as int8_t values is equivalent to
|
||||
// subtracting 128 from them, thus implementing for free the subtraction of
|
||||
// the zero_point value 128.
|
||||
const int8* shuffled_weights_ptr =
|
||||
reinterpret_cast<const int8*>(shuffled_weights_data);
|
||||
const int8_t* shuffled_weights_ptr =
|
||||
reinterpret_cast<const int8_t*>(shuffled_weights_data);
|
||||
// Likewise, we preshuffled and pre-xored the input data above.
|
||||
const int8* shuffled_input_data =
|
||||
reinterpret_cast<const int8*>(shuffled_input_workspace_data);
|
||||
const int8_t* shuffled_input_data =
|
||||
reinterpret_cast<const int8_t*>(shuffled_input_workspace_data);
|
||||
for (int c = 0; c < output_depth; c += 4) {
|
||||
// Internal accumulation.
|
||||
// Initialize accumulator with the bias-value.
|
||||
int32 accum[4] = {0};
|
||||
int32_t accum[4] = {0};
|
||||
// Accumulation loop.
|
||||
for (int d = 0; d < accum_depth; d += 16) {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
for (int j = 0; j < 16; j++) {
|
||||
int8 input_val = shuffled_input_data[d + j];
|
||||
int8 weights_val = *shuffled_weights_ptr++;
|
||||
int8_t input_val = shuffled_input_data[d + j];
|
||||
int8_t weights_val = *shuffled_weights_ptr++;
|
||||
accum[i] += weights_val * input_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < 4; i++) {
|
||||
// Add bias value
|
||||
int32 acc = accum[i] + bias_data[c + i];
|
||||
// Down-scale the final int32 accumulator to the scale used by our
|
||||
int32_t acc = accum[i] + bias_data[c + i];
|
||||
// Down-scale the final int32_t accumulator to the scale used by our
|
||||
// (16-bit, typically 3 integer bits) fixed-point format. The quantized
|
||||
// multiplier and shift here have been pre-computed offline
|
||||
// (e.g. by toco).
|
||||
acc =
|
||||
MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
|
||||
// Saturate, cast to int16, and store to output array.
|
||||
// Saturate, cast to int16_t, and store to output array.
|
||||
acc = std::max(acc, output_activation_min);
|
||||
acc = std::min(acc, output_activation_max);
|
||||
output_ptr[c + i] = acc;
|
||||
}
|
||||
}
|
||||
} else if (batches == 4) {
|
||||
int16* output_ptr = output_data;
|
||||
int16_t* output_ptr = output_data;
|
||||
// Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
|
||||
// so that just reinterpreting them as int8 values is equivalent to
|
||||
// so that just reinterpreting them as int8_t values is equivalent to
|
||||
// subtracting 128 from them, thus implementing for free the subtraction of
|
||||
// the zero_point value 128.
|
||||
const int8* shuffled_weights_ptr =
|
||||
reinterpret_cast<const int8*>(shuffled_weights_data);
|
||||
const int8_t* shuffled_weights_ptr =
|
||||
reinterpret_cast<const int8_t*>(shuffled_weights_data);
|
||||
// Likewise, we preshuffled and pre-xored the input data above.
|
||||
const int8* shuffled_input_data =
|
||||
reinterpret_cast<const int8*>(shuffled_input_workspace_data);
|
||||
const int8_t* shuffled_input_data =
|
||||
reinterpret_cast<const int8_t*>(shuffled_input_workspace_data);
|
||||
for (int c = 0; c < output_depth; c += 4) {
|
||||
const int8* shuffled_input_ptr = shuffled_input_data;
|
||||
const int8_t* shuffled_input_ptr = shuffled_input_data;
|
||||
// Accumulation loop.
|
||||
// Internal accumulation.
|
||||
// Initialize accumulator with the bias-value.
|
||||
int32 accum[4][4];
|
||||
int32_t accum[4][4];
|
||||
for (int i = 0; i < 4; i++) {
|
||||
for (int b = 0; b < 4; b++) {
|
||||
accum[i][b] = 0;
|
||||
@ -281,8 +282,8 @@ inline void ShuffledFullyConnected(
|
||||
for (int i = 0; i < 4; i++) {
|
||||
for (int b = 0; b < 4; b++) {
|
||||
for (int j = 0; j < 16; j++) {
|
||||
int8 input_val = shuffled_input_ptr[16 * b + j];
|
||||
int8 weights_val = shuffled_weights_ptr[16 * i + j];
|
||||
int8_t input_val = shuffled_input_ptr[16 * b + j];
|
||||
int8_t weights_val = shuffled_weights_ptr[16 * i + j];
|
||||
accum[i][b] += weights_val * input_val;
|
||||
}
|
||||
}
|
||||
@ -293,14 +294,14 @@ inline void ShuffledFullyConnected(
|
||||
for (int i = 0; i < 4; i++) {
|
||||
for (int b = 0; b < 4; b++) {
|
||||
// Add bias value
|
||||
int32 acc = accum[i][b] + bias_data[c + i];
|
||||
// Down-scale the final int32 accumulator to the scale used by our
|
||||
int32_t acc = accum[i][b] + bias_data[c + i];
|
||||
// Down-scale the final int32_t accumulator to the scale used by our
|
||||
// (16-bit, typically 3 integer bits) fixed-point format. The
|
||||
// quantized multiplier and shift here have been pre-computed offline
|
||||
// (e.g. by toco).
|
||||
acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
|
||||
output_shift);
|
||||
// Saturate, cast to int16, and store to output array.
|
||||
// Saturate, cast to int16_t, and store to output array.
|
||||
acc = std::max(acc, output_activation_min);
|
||||
acc = std::min(acc, output_activation_max);
|
||||
output_ptr[b * output_depth + c + i] = acc;
|
||||
|
||||
@ -23,34 +23,41 @@ limitations under the License.
|
||||
namespace tflite {
|
||||
namespace reference_integer_ops {
|
||||
|
||||
inline void CheckArithmeticParams(const ArithmeticParams& params) {
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
// Input offset is negative input zero point. Activation tensors are
|
||||
// asymmetric quantized so they span the full int8 range.
|
||||
TFLITE_DCHECK_GE(-params.input1_offset, std::numeric_limits<int8_t>::min());
|
||||
TFLITE_DCHECK_GE(-params.input2_offset, std::numeric_limits<int8_t>::min());
|
||||
TFLITE_DCHECK_LE(-params.input1_offset, std::numeric_limits<int8_t>::max());
|
||||
TFLITE_DCHECK_LE(-params.input2_offset, std::numeric_limits<int8_t>::max());
|
||||
}
|
||||
|
||||
// Element-wise add that can often be used for inner loop of broadcast add as
|
||||
// well as the non-broadcast add.
|
||||
inline void AddElementwise(int size, const ArithmeticParams& params,
|
||||
const int8_t* input1_data, const int8_t* input2_data,
|
||||
int8_t* output_data) {
|
||||
const int32_t int8_max_value = std::numeric_limits<int8_t>::max();
|
||||
TFLITE_DCHECK_GE(params.input1_offset, -1 * int8_max_value);
|
||||
TFLITE_DCHECK_GE(params.input2_offset, -1 * int8_max_value);
|
||||
TFLITE_DCHECK_LE(params.input1_offset, int8_max_value);
|
||||
TFLITE_DCHECK_LE(params.input2_offset, int8_max_value);
|
||||
CheckArithmeticParams(params);
|
||||
|
||||
for (int i = 0; i < size; ++i) {
|
||||
const int32 input1_val = params.input1_offset + input1_data[i];
|
||||
const int32 input2_val = params.input2_offset + input2_data[i];
|
||||
const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
|
||||
const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
|
||||
const int32 scaled_input1_val =
|
||||
const int32_t input1_val = params.input1_offset + input1_data[i];
|
||||
const int32_t input2_val = params.input2_offset + input2_data[i];
|
||||
const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
|
||||
const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
|
||||
const int32_t scaled_input1_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input1_val, params.input1_multiplier, params.input1_shift);
|
||||
const int32 scaled_input2_val =
|
||||
const int32_t scaled_input2_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input2_val, params.input2_multiplier, params.input2_shift);
|
||||
const int32 raw_sum = scaled_input1_val + scaled_input2_val;
|
||||
const int32 raw_output =
|
||||
const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
|
||||
const int32_t raw_output =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
raw_sum, params.output_multiplier, params.output_shift) +
|
||||
params.output_offset;
|
||||
const int32 clamped_output =
|
||||
const int32_t clamped_output =
|
||||
std::min(params.quantized_activation_max,
|
||||
std::max(params.quantized_activation_min, raw_output));
|
||||
output_data[i] = static_cast<int8_t>(clamped_output);
|
||||
@ -61,16 +68,11 @@ inline void Add(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape, const int8_t* input1_data,
|
||||
const RuntimeShape& input2_shape, const int8_t* input2_data,
|
||||
const RuntimeShape& output_shape, int8_t* output_data) {
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
CheckArithmeticParams(params);
|
||||
|
||||
const int flat_size =
|
||||
MatchingElementsSize(input1_shape, input2_shape, output_shape);
|
||||
|
||||
const int32_t int8_max_value = std::numeric_limits<int8_t>::max();
|
||||
TFLITE_DCHECK_GE(params.input1_offset, -1 * int8_max_value);
|
||||
TFLITE_DCHECK_GE(params.input2_offset, -1 * int8_max_value);
|
||||
TFLITE_DCHECK_LE(params.input1_offset, int8_max_value);
|
||||
TFLITE_DCHECK_LE(params.input2_offset, int8_max_value);
|
||||
AddElementwise(flat_size, params, input1_data, input2_data, output_data);
|
||||
}
|
||||
|
||||
|
||||
@@ -22,27 +22,27 @@ namespace reference_integer_ops {

// Fixed-point per-channel-quantization convolution reference kernel.
inline void ConvPerChannel(
const ConvParams& params, const int32* output_multiplier,
const int32* output_shift, const RuntimeShape& input_shape,
const int8* input_data, const RuntimeShape& filter_shape,
const int8* filter_data, const RuntimeShape& bias_shape,
const int32* bias_data, const RuntimeShape& output_shape,
int8* output_data) {
const ConvParams& params, const int32_t* output_multiplier,
const int32_t* output_shift, const RuntimeShape& input_shape,
const int8_t* input_data, const RuntimeShape& filter_shape,
const int8_t* filter_data, const RuntimeShape& bias_shape,
const int32_t* bias_data, const RuntimeShape& output_shape,
int8_t* output_data) {
// Get parameters.
const int32 input_offset = params.input_offset;  // r = s(q - Z)
const int32_t input_offset = params.input_offset;  // r = s(q - Z)
const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
const int dilation_width_factor = params.dilation_width_factor;
const int dilation_height_factor = params.dilation_height_factor;
const int pad_width = params.padding_values.width;
const int pad_height = params.padding_values.height;
const int32 output_offset = params.output_offset;
const int32_t output_offset = params.output_offset;

// Set min and max value of the output.
const int32 output_activation_min = params.quantized_activation_min;
const int32 output_activation_max = params.quantized_activation_max;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;

// Sanity check.
// Consistency check.
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
@@ -67,7 +67,7 @@ inline void ConvPerChannel(
for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
const int in_x_origin = (out_x * stride_width) - pad_width;
const int in_y_origin = (out_y * stride_height) - pad_height;
int32 acc = 0;
int32_t acc = 0;
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
@@ -79,18 +79,18 @@ inline void ConvPerChannel(
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
(in_y < input_height);
if (is_point_inside_image) {
int32 input_val = input_data[Offset(input_shape, batch, in_y,
in_x, in_channel)];
int32 filter_val =
int32_t input_val = input_data[Offset(
input_shape, batch, in_y, in_x, in_channel)];
int32_t filter_val =
filter_data[Offset(filter_shape, out_channel, filter_y,
filter_x, in_channel)];
// Accumulate with 32 bits accumulator.
// In the nudging process during model quantization, we force
// real value of 0.0 be represented by a quantized value. This
// guarantees that the input_offset is a int8, even though it
// is represented using int32.
// int32 += int8 * (int8 - int8) so the highest value we can
// get from each accumulation is [-127, 127] * ([-128, 127] -
// guarantees that the input_offset is a int8_t, even though
// it is represented using int32_t. int32_t += int8_t *
// (int8_t - int8_t) so the highest value we can get from each
// accumulation is [-127, 127] * ([-128, 127] -
// [-128, 127]), which is [-32512, 32512]. log2(32512)
// = 14.98, which means we can accumulate at least 2^16
// multiplications without overflow. The accumulator is
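The comment above bounds each multiply-accumulate term at 32512; the short standalone check below (illustrative only, not part of the TFLite sources) confirms that a 32-bit accumulator then safely absorbs more than 2^16 such terms.

#include <cstdint>
#include <cstdio>
#include <limits>

int main() {
  // Worst-case magnitude of one term, as stated in the comment above:
  // [-127, 127] * ([-128, 127] - [-128, 127]) stays within [-32512, 32512].
  const int32_t kMaxTerm = 32512;
  // Number of worst-case terms a 32-bit accumulator can hold before it can
  // overflow (about 66052, comfortably above 2^16 = 65536).
  const int32_t safe_terms = std::numeric_limits<int32_t>::max() / kMaxTerm;
  std::printf("safe accumulations: %d (2^16 = %d)\n", safe_terms, 1 << 16);
  return 0;
}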
@ -125,12 +125,12 @@ inline void ConvPerChannel(
|
||||
// Fixed-point per-channel-quantization convolution reference kernel.
|
||||
// 16-bit data and 8-bit filter
|
||||
inline void ConvPerChannel(
|
||||
const ConvParams& params, const int32* output_multiplier,
|
||||
const int32* output_shift, const RuntimeShape& input_shape,
|
||||
const int16* input_data, const RuntimeShape& filter_shape,
|
||||
const int8* filter_data, const RuntimeShape& bias_shape,
|
||||
const ConvParams& params, const int32_t* output_multiplier,
|
||||
const int32_t* output_shift, const RuntimeShape& input_shape,
|
||||
const int16_t* input_data, const RuntimeShape& filter_shape,
|
||||
const int8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const std::int64_t* bias_data, const RuntimeShape& output_shape,
|
||||
int16* output_data) {
|
||||
int16_t* output_data) {
|
||||
// Get parameters.
|
||||
const int stride_width = params.stride_width;
|
||||
const int stride_height = params.stride_height;
|
||||
@ -140,10 +140,10 @@ inline void ConvPerChannel(
|
||||
const int pad_height = params.padding_values.height;
|
||||
|
||||
// Set min and max value of the output.
|
||||
const int32 output_activation_min = params.quantized_activation_min;
|
||||
const int32 output_activation_max = params.quantized_activation_max;
|
||||
const int32_t output_activation_min = params.quantized_activation_min;
|
||||
const int32_t output_activation_max = params.quantized_activation_max;
|
||||
|
||||
// Sanity check.
|
||||
// Consistency check.
|
||||
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
|
||||
@ -180,13 +180,13 @@ inline void ConvPerChannel(
|
||||
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
|
||||
(in_y < input_height);
|
||||
if (is_point_inside_image) {
|
||||
int32 input_val = input_data[Offset(input_shape, batch, in_y,
|
||||
in_x, in_channel)];
|
||||
int32 filter_val =
|
||||
int32_t input_val = input_data[Offset(
|
||||
input_shape, batch, in_y, in_x, in_channel)];
|
||||
int32_t filter_val =
|
||||
filter_data[Offset(filter_shape, out_channel, filter_y,
|
||||
filter_x, in_channel)];
|
||||
// Accumulate with 64 bits accumulator.
|
||||
// int64 += int8 * int16 so the highest value we can
|
||||
// int64_t += int8_t * int16_t so the highest value we can
|
||||
// get from each accumulation is [-127, 127] * ([-32768,
|
||||
// 32767] -
|
||||
// [-32768, 32767]), which is [-8322945, 8322945].
|
||||
|
||||
@ -20,12 +20,12 @@ limitations under the License.
|
||||
namespace tflite {
|
||||
namespace reference_integer_ops {
|
||||
inline void DepthwiseConvPerChannel(
|
||||
const DepthwiseParams& params, const int32* output_multiplier,
|
||||
const int32* output_shift, const RuntimeShape& input_shape,
|
||||
const int8* input_data, const RuntimeShape& filter_shape,
|
||||
const int8* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32* bias_data, const RuntimeShape& output_shape,
|
||||
int8* output_data) {
|
||||
const DepthwiseParams& params, const int32_t* output_multiplier,
|
||||
const int32_t* output_shift, const RuntimeShape& input_shape,
|
||||
const int8_t* input_data, const RuntimeShape& filter_shape,
|
||||
const int8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32_t* bias_data, const RuntimeShape& output_shape,
|
||||
int8_t* output_data) {
|
||||
// Get parameters.
|
||||
// TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro.
|
||||
const int stride_width = params.stride_width;
|
||||
@ -35,10 +35,10 @@ inline void DepthwiseConvPerChannel(
|
||||
const int pad_width = params.padding_values.width;
|
||||
const int pad_height = params.padding_values.height;
|
||||
const int depth_multiplier = params.depth_multiplier;
|
||||
const int32 input_offset = params.input_offset;
|
||||
const int32 output_offset = params.output_offset;
|
||||
const int32 output_activation_min = params.quantized_activation_min;
|
||||
const int32 output_activation_max = params.quantized_activation_max;
|
||||
const int32_t input_offset = params.input_offset;
|
||||
const int32_t output_offset = params.output_offset;
|
||||
const int32_t output_activation_min = params.quantized_activation_min;
|
||||
const int32_t output_activation_max = params.quantized_activation_max;
|
||||
|
||||
// Check dimensions of the tensors.
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
@ -66,7 +66,7 @@ inline void DepthwiseConvPerChannel(
|
||||
const int output_channel = m + in_channel * depth_multiplier;
|
||||
const int in_x_origin = (out_x * stride_width) - pad_width;
|
||||
const int in_y_origin = (out_y * stride_height) - pad_height;
|
||||
int32 acc = 0;
|
||||
int32_t acc = 0;
|
||||
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
|
||||
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
|
||||
const int in_x = in_x_origin + dilation_width_factor * filter_x;
|
||||
@ -77,17 +77,17 @@ inline void DepthwiseConvPerChannel(
|
||||
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
|
||||
(in_y < input_height);
|
||||
if (is_point_inside_image) {
|
||||
int32 input_val = input_data[Offset(input_shape, batch, in_y,
|
||||
in_x, in_channel)];
|
||||
int32 filter_val = filter_data[Offset(
|
||||
int32_t input_val = input_data[Offset(
|
||||
input_shape, batch, in_y, in_x, in_channel)];
|
||||
int32_t filter_val = filter_data[Offset(
|
||||
filter_shape, 0, filter_y, filter_x, output_channel)];
|
||||
// Accumulate with 32 bits accumulator.
|
||||
// In the nudging process during model quantization, we force
|
||||
// real value of 0.0 be represented by a quantized value. This
|
||||
// guarantees that the input_offset is a int8, even though it
|
||||
// is represented using int32.
|
||||
// int32 += int8 * (int8 - int8) so the highest value we can
|
||||
// get from each accumulation is [-127, 127] * ([-128, 127] -
|
||||
// guarantees that the input_offset is a int8_t, even though
|
||||
// it is represented using int32_t. int32_t += int8_t *
|
||||
// (int8_t - int8_t) so the highest value we can get from each
|
||||
// accumulation is [-127, 127] * ([-128, 127] -
|
||||
// [-128, 127]), which is [-32512, 32512]. log2(32512)
|
||||
// = 14.98, which means we can accumulate at least 2^16
|
||||
// multiplications without overflow. The accumulator is
|
||||
@ -120,12 +120,12 @@ inline void DepthwiseConvPerChannel(
|
||||
}
|
||||
|
||||
inline void DepthwiseConvPerChannel(
|
||||
const DepthwiseParams& params, const int32* output_multiplier,
|
||||
const int32* output_shift, const RuntimeShape& input_shape,
|
||||
const int16* input_data, const RuntimeShape& filter_shape,
|
||||
const int8* filter_data, const RuntimeShape& bias_shape,
|
||||
const DepthwiseParams& params, const int32_t* output_multiplier,
|
||||
const int32_t* output_shift, const RuntimeShape& input_shape,
|
||||
const int16_t* input_data, const RuntimeShape& filter_shape,
|
||||
const int8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const std::int64_t* bias_data, const RuntimeShape& output_shape,
|
||||
int16* output_data) {
|
||||
int16_t* output_data) {
|
||||
// Get parameters.
|
||||
const int stride_width = params.stride_width;
|
||||
const int stride_height = params.stride_height;
|
||||
@ -134,8 +134,8 @@ inline void DepthwiseConvPerChannel(
|
||||
const int pad_width = params.padding_values.width;
|
||||
const int pad_height = params.padding_values.height;
|
||||
const int depth_multiplier = params.depth_multiplier;
|
||||
const int32 output_activation_min = params.quantized_activation_min;
|
||||
const int32 output_activation_max = params.quantized_activation_max;
|
||||
const int32_t output_activation_min = params.quantized_activation_min;
|
||||
const int32_t output_activation_max = params.quantized_activation_max;
|
||||
|
||||
// Check dimensions of the tensors.
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
@ -174,9 +174,9 @@ inline void DepthwiseConvPerChannel(
|
||||
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
|
||||
(in_y < input_height);
|
||||
if (is_point_inside_image) {
|
||||
int32 input_val = input_data[Offset(input_shape, batch, in_y,
|
||||
in_x, in_channel)];
|
||||
int32 filter_val = filter_data[Offset(
|
||||
int32_t input_val = input_data[Offset(
|
||||
input_shape, batch, in_y, in_x, in_channel)];
|
||||
int32_t filter_val = filter_data[Offset(
|
||||
filter_shape, 0, filter_y, filter_x, output_channel)];
|
||||
// Accumulate with 64 bits accumulator.
|
||||
// We assume maximum of 2^16 accumulations as with the 8-bit
|
||||
@ -190,7 +190,7 @@ inline void DepthwiseConvPerChannel(
|
||||
if (bias_data) {
|
||||
acc += bias_data[output_channel];
|
||||
}
|
||||
int32 scaled_acc = MultiplyByQuantizedMultiplier(
|
||||
int32_t scaled_acc = MultiplyByQuantizedMultiplier(
|
||||
acc, output_multiplier[output_channel],
|
||||
output_shift[output_channel]);
|
||||
scaled_acc = std::max(scaled_acc, output_activation_min);
|
||||
@ -207,8 +207,8 @@ inline void DepthwiseConvPerChannel(
|
||||
|
||||
inline void DepthwiseConvHybridPerChannel(
|
||||
const DepthwiseParams& params, float* scaling_factors_ptr,
|
||||
const RuntimeShape& input_shape, const int8* input_data,
|
||||
const RuntimeShape& filter_shape, const int8* filter_data,
|
||||
const RuntimeShape& input_shape, const int8_t* input_data,
|
||||
const RuntimeShape& filter_shape, const int8_t* filter_data,
|
||||
const RuntimeShape& bias_shape, const float* bias_data,
|
||||
const RuntimeShape& output_shape, float* output_data,
|
||||
const float* per_channel_scale, int32_t* input_offset) {
|
||||
@ -247,7 +247,7 @@ inline void DepthwiseConvHybridPerChannel(
|
||||
const int output_channel = m + in_channel * depth_multiplier;
|
||||
const int in_x_origin = (out_x * stride_width) - pad_width;
|
||||
const int in_y_origin = (out_y * stride_height) - pad_height;
|
||||
int32 acc = 0;
|
||||
int32_t acc = 0;
|
||||
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
|
||||
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
|
||||
const int in_x = in_x_origin + dilation_width_factor * filter_x;
|
||||
@ -258,9 +258,9 @@ inline void DepthwiseConvHybridPerChannel(
|
||||
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
|
||||
(in_y < input_height);
|
||||
if (is_point_inside_image) {
|
||||
int32 input_val = input_data[Offset(input_shape, batch, in_y,
|
||||
in_x, in_channel)];
|
||||
int32 filter_val = filter_data[Offset(
|
||||
int32_t input_val = input_data[Offset(
|
||||
input_shape, batch, in_y, in_x, in_channel)];
|
||||
int32_t filter_val = filter_data[Offset(
|
||||
filter_shape, 0, filter_y, filter_x, output_channel)];
|
||||
acc += filter_val * (input_val - input_offset[batch]);
|
||||
}
|
||||
|
||||
@ -24,15 +24,15 @@ inline void FullyConnected(
|
||||
const FullyConnectedParams& params, const RuntimeShape& input_shape,
|
||||
const int8_t* input_data, const RuntimeShape& filter_shape,
|
||||
const int8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const int32* bias_data, const RuntimeShape& output_shape,
|
||||
const int32_t* bias_data, const RuntimeShape& output_shape,
|
||||
int8_t* output_data) {
|
||||
const int32 input_offset = params.input_offset;
|
||||
const int32 filter_offset = params.weights_offset;
|
||||
const int32 output_offset = params.output_offset;
|
||||
const int32 output_multiplier = params.output_multiplier;
|
||||
const int32_t input_offset = params.input_offset;
|
||||
const int32_t filter_offset = params.weights_offset;
|
||||
const int32_t output_offset = params.output_offset;
|
||||
const int32_t output_multiplier = params.output_multiplier;
|
||||
const int output_shift = params.output_shift;
|
||||
const int32 output_activation_min = params.quantized_activation_min;
|
||||
const int32 output_activation_max = params.quantized_activation_max;
|
||||
const int32_t output_activation_min = params.quantized_activation_min;
|
||||
const int32_t output_activation_max = params.quantized_activation_max;
|
||||
TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
|
||||
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 2);
|
||||
|
||||
@ -44,10 +44,10 @@ inline void FullyConnected(
|
||||
const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
|
||||
for (int b = 0; b < batches; ++b) {
|
||||
for (int out_c = 0; out_c < output_depth; ++out_c) {
|
||||
int32 acc = 0;
|
||||
int32_t acc = 0;
|
||||
for (int d = 0; d < accum_depth; ++d) {
|
||||
int32 input_val = input_data[b * accum_depth + d];
|
||||
int32 filter_val = filter_data[out_c * accum_depth + d];
|
||||
int32_t input_val = input_data[b * accum_depth + d];
|
||||
int32_t filter_val = filter_data[out_c * accum_depth + d];
|
||||
acc += (filter_val + filter_offset) * (input_val + input_offset);
|
||||
}
|
||||
if (bias_data) {
|
||||
@ -68,11 +68,11 @@ inline void FullyConnected(
|
||||
const int8_t* filter_data, const RuntimeShape& bias_shape,
|
||||
const int64_t* bias_data, const RuntimeShape& output_shape,
|
||||
int16_t* output_data) {
|
||||
const int32 filter_offset = params.weights_offset;
|
||||
const int32 output_multiplier = params.output_multiplier;
|
||||
const int32_t filter_offset = params.weights_offset;
|
||||
const int32_t output_multiplier = params.output_multiplier;
|
||||
const int output_shift = params.output_shift;
|
||||
const int32 output_activation_min = params.quantized_activation_min;
|
||||
const int32 output_activation_max = params.quantized_activation_max;
|
||||
const int32_t output_activation_min = params.quantized_activation_min;
|
||||
const int32_t output_activation_max = params.quantized_activation_max;
|
||||
TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
|
||||
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 2);
|
||||
|
||||
@ -86,8 +86,8 @@ inline void FullyConnected(
|
||||
for (int out_c = 0; out_c < output_depth; ++out_c) {
|
||||
int64_t acc = 0;
|
||||
for (int d = 0; d < accum_depth; ++d) {
|
||||
int32 input_val = input_data[b * accum_depth + d];
|
||||
int32 filter_val = filter_data[out_c * accum_depth + d];
|
||||
int32_t input_val = input_data[b * accum_depth + d];
|
||||
int32_t filter_val = filter_data[out_c * accum_depth + d];
|
||||
acc += (filter_val + filter_offset) * input_val;
|
||||
}
|
||||
if (bias_data) {
|
||||
|
||||
@ -21,8 +21,8 @@ namespace tflite {
|
||||
namespace reference_integer_ops {
|
||||
|
||||
inline void L2Normalization(int32_t input_zero_point, int32_t outer_size,
|
||||
int32_t depth, const int8* input_data,
|
||||
int8* output_data) {
|
||||
int32_t depth, const int8_t* input_data,
|
||||
int8_t* output_data) {
|
||||
static constexpr int8_t kMinInt8 = std::numeric_limits<int8_t>::min();
|
||||
static constexpr int8_t kMaxInt8 = std::numeric_limits<int8_t>::max();
|
||||
// The output scale must be in sync with Prepare().
|
||||
@ -30,7 +30,7 @@ inline void L2Normalization(int32_t input_zero_point, int32_t outer_size,
|
||||
// to [-1, 127/128].
|
||||
static constexpr int32_t kOutputScale = 7;
|
||||
for (int outer_index = 0; outer_index < outer_size; ++outer_index) {
|
||||
// int32 = (int8 - int8) ^ 2.
|
||||
// int32_t = (int8_t - int8_t) ^ 2.
|
||||
// ([-128, 127] - [-128, 127]) ^ 2 = [0, (2^8 - 1)^2] so the accumulator is
|
||||
// safe from overflowing in at least 2^16 steps.
|
||||
int32_t acc = 0;
|
||||
@ -55,7 +55,7 @@ inline void L2Normalization(int32_t input_zero_point, int32_t outer_size,
|
||||
std::min(static_cast<int32_t>(kMaxInt8),
|
||||
std::max(static_cast<int32_t>(kMinInt8), output_in_q24));
|
||||
output_data[depth * outer_index + inner_index] =
|
||||
static_cast<int8>(output_in_q24);
|
||||
static_cast<int8_t>(output_in_q24);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,14 +27,14 @@ inline void MulElementwise(int size, const ArithmeticParams& params,
const T* input1_data, const T* input2_data,
T* output_data) {
for (int i = 0; i < size; ++i) {
const int32 input1_val = params.input1_offset + input1_data[i];
const int32 input2_val = params.input2_offset + input2_data[i];
const int32 unclamped_result =
const int32_t input1_val = params.input1_offset + input1_data[i];
const int32_t input2_val = params.input2_offset + input2_data[i];
const int32_t unclamped_result =
params.output_offset +
MultiplyByQuantizedMultiplier(input1_val * input2_val,
params.output_multiplier,
params.output_shift);
const int32 clamped_output =
const int32_t clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, unclamped_result));
output_data[i] = static_cast<T>(clamped_output);
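For reference, the sketch below reproduces what MulElementwise above computes, but in the real-number domain rather than in fixed point. All scales, zero points and input values are made up for illustration; the fixed-point output_multiplier/output_shift pair in the kernel encodes the combined scale s1*s2/s_out that is applied directly here.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  // Example quantization parameters (assumed, not taken from any model).
  const float s1 = 0.02f, s2 = 0.05f, s_out = 0.1f;  // tensor scales
  const int32_t z1 = -3, z2 = 5, z_out = 1;          // zero points
  const int8_t q1 = 57, q2 = -18;                    // quantized inputs

  // The kernel works on (value - zero_point); its precomputed input offsets
  // serve the same purpose as the subtractions below.
  const int32_t v1 = q1 - z1;
  const int32_t v2 = q2 - z2;

  // Apply the real-valued combined scale instead of the fixed-point form.
  const float real_scale = s1 * s2 / s_out;
  const int32_t unclamped =
      z_out + static_cast<int32_t>(std::lround(v1 * v2 * real_scale));
  // Clamp to the int8_t range; the kernel clamps to
  // params.quantized_activation_min/max instead.
  const int32_t q_out =
      std::min<int32_t>(127, std::max<int32_t>(-128, unclamped));
  std::printf("q_out = %d\n", q_out);
  return 0;
}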
@ -57,13 +57,13 @@ inline void Mul(const ArithmeticParams& params,
|
||||
|
||||
// Mul with 16 bit inputs and int8_t outputs.
|
||||
inline void Mul(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape, const int16* input1_data,
|
||||
const RuntimeShape& input2_shape, const int16* input2_data,
|
||||
const RuntimeShape& input1_shape, const int16_t* input1_data,
|
||||
const RuntimeShape& input2_shape, const int16_t* input2_data,
|
||||
const RuntimeShape& output_shape, int8_t* output_data) {
|
||||
ruy::profiler::ScopeLabel label("Mul/Int16Int8");
|
||||
int32 output_offset = params.output_offset;
|
||||
int32 output_activation_min = params.quantized_activation_min;
|
||||
int32 output_activation_max = params.quantized_activation_max;
|
||||
int32_t output_offset = params.output_offset;
|
||||
int32_t output_activation_min = params.quantized_activation_min;
|
||||
int32_t output_activation_max = params.quantized_activation_max;
|
||||
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
|
||||
|
||||
const int flat_size =
|
||||
@ -75,12 +75,12 @@ inline void Mul(const ArithmeticParams& params,
|
||||
|
||||
F0 unclamped_result =
|
||||
F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]);
|
||||
int16 rescaled_result =
|
||||
int16_t rescaled_result =
|
||||
gemmlowp::RoundingDivideByPOT(unclamped_result.raw(), 8);
|
||||
int16 clamped_result =
|
||||
std::min<int16>(output_activation_max - output_offset, rescaled_result);
|
||||
clamped_result =
|
||||
std::max<int16>(output_activation_min - output_offset, clamped_result);
|
||||
int16_t clamped_result = std::min<int16_t>(
|
||||
output_activation_max - output_offset, rescaled_result);
|
||||
clamped_result = std::max<int16_t>(output_activation_min - output_offset,
|
||||
clamped_result);
|
||||
output_data[i] = output_offset + clamped_result;
|
||||
}
|
||||
}
|
||||
@ -104,18 +104,18 @@ inline void BroadcastMul4DSlow(
|
||||
for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
|
||||
for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
|
||||
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
|
||||
const int32 input1_val =
|
||||
const int32_t input1_val =
|
||||
params.input1_offset +
|
||||
input1_data[SubscriptToIndex(desc1, b, y, x, c)];
|
||||
const int32 input2_val =
|
||||
const int32_t input2_val =
|
||||
params.input2_offset +
|
||||
input2_data[SubscriptToIndex(desc2, b, y, x, c)];
|
||||
const int32 unclamped_result =
|
||||
const int32_t unclamped_result =
|
||||
params.output_offset +
|
||||
MultiplyByQuantizedMultiplier(input1_val * input2_val,
|
||||
params.output_multiplier,
|
||||
params.output_shift);
|
||||
const int32 clamped_output = std::min(
|
||||
const int32_t clamped_output = std::min(
|
||||
params.quantized_activation_max,
|
||||
std::max(params.quantized_activation_min, unclamped_result));
|
||||
output_data[Offset(extended_output_shape, b, y, x, c)] =
|
||||
|
||||
@ -22,8 +22,9 @@ namespace tflite {
|
||||
namespace reference_integer_ops {
|
||||
|
||||
inline void AveragePool(const PoolParams& params,
|
||||
const RuntimeShape& input_shape, const int8* input_data,
|
||||
const RuntimeShape& output_shape, int8* output_data) {
|
||||
const RuntimeShape& input_shape,
|
||||
const int8_t* input_data,
|
||||
const RuntimeShape& output_shape, int8_t* output_data) {
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
@ -52,7 +53,7 @@ inline void AveragePool(const PoolParams& params,
|
||||
const int filter_y_start = std::max(0, -in_y_origin);
|
||||
const int filter_y_end =
|
||||
std::min(params.filter_height, input_height - in_y_origin);
|
||||
int32 acc = 0;
|
||||
int32_t acc = 0;
|
||||
int filter_count = 0;
|
||||
for (int filter_y = filter_y_start; filter_y < filter_y_end;
|
||||
++filter_y) {
|
||||
@ -71,7 +72,7 @@ inline void AveragePool(const PoolParams& params,
|
||||
acc = std::max(acc, params.quantized_activation_min);
|
||||
acc = std::min(acc, params.quantized_activation_max);
|
||||
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
|
||||
static_cast<int8>(acc);
|
||||
static_cast<int8_t>(acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -79,8 +80,8 @@ inline void AveragePool(const PoolParams& params,
|
||||
}
|
||||
|
||||
inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
|
||||
const int8* input_data, const RuntimeShape& output_shape,
|
||||
int8* output_data) {
|
||||
const int8_t* input_data, const RuntimeShape& output_shape,
|
||||
int8_t* output_data) {
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
TFLITE_DCHECK_GE(params.quantized_activation_min,
|
||||
@ -137,8 +138,9 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
|
||||
|
||||
inline void AveragePool(const PoolParams& params,
|
||||
const RuntimeShape& input_shape,
|
||||
const int16* input_data,
|
||||
const RuntimeShape& output_shape, int16* output_data) {
|
||||
const int16_t* input_data,
|
||||
const RuntimeShape& output_shape,
|
||||
int16_t* output_data) {
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
@ -167,7 +169,7 @@ inline void AveragePool(const PoolParams& params,
|
||||
const int filter_y_start = std::max(0, -in_y_origin);
|
||||
const int filter_y_end =
|
||||
std::min(params.filter_height, input_height - in_y_origin);
|
||||
int32 acc = 0;
|
||||
int32_t acc = 0;
|
||||
int filter_count = 0;
|
||||
for (int filter_y = filter_y_start; filter_y < filter_y_end;
|
||||
++filter_y) {
|
||||
@ -186,7 +188,7 @@ inline void AveragePool(const PoolParams& params,
|
||||
acc = std::max(acc, params.quantized_activation_min);
|
||||
acc = std::min(acc, params.quantized_activation_max);
|
||||
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
|
||||
static_cast<int16>(acc);
|
||||
static_cast<int16_t>(acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -194,8 +196,8 @@ inline void AveragePool(const PoolParams& params,
|
||||
}
|
||||
|
||||
inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
|
||||
const int16* input_data, const RuntimeShape& output_shape,
|
||||
int16* output_data) {
|
||||
const int16_t* input_data, const RuntimeShape& output_shape,
|
||||
int16_t* output_data) {
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
TFLITE_DCHECK_GE(params.quantized_activation_min,
|
||||
|
||||
@ -52,40 +52,39 @@ inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
|
||||
|
||||
inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
|
||||
const RuntimeShape& input_shape,
|
||||
const uint8* input_data,
|
||||
const uint8_t* input_data,
|
||||
const RuntimeShape& output_shape,
|
||||
uint8* output_data) {
|
||||
uint8_t* output_data) {
|
||||
const int trailing_dim = input_shape.DimensionsCount() - 1;
|
||||
const int depth =
|
||||
MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
|
||||
const int outer_size =
|
||||
MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
|
||||
const int32 input_zero_point = op_params.input_zero_point;
|
||||
const int32_t input_zero_point = op_params.input_zero_point;
|
||||
|
||||
for (int i = 0; i < outer_size; ++i) {
|
||||
int32 square_l2_norm = 0;
|
||||
int32_t square_l2_norm = 0;
|
||||
for (int c = 0; c < depth; c++) {
|
||||
int32 diff = input_data[depth * i + c] - input_zero_point;
|
||||
int32_t diff = input_data[depth * i + c] - input_zero_point;
|
||||
square_l2_norm += diff * diff;
|
||||
}
|
||||
int32 inv_l2norm_multiplier;
|
||||
int32_t inv_l2norm_multiplier;
|
||||
int inv_l2norm_shift;
|
||||
GetInvSqrtQuantizedMultiplierExp(square_l2_norm, kReverseShift,
|
||||
&inv_l2norm_multiplier, &inv_l2norm_shift);
|
||||
for (int c = 0; c < depth; c++) {
|
||||
int32 diff = input_data[depth * i + c] - input_zero_point;
|
||||
int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
int32_t diff = input_data[depth * i + c] - input_zero_point;
|
||||
int32_t rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
|
||||
int32 unclamped_output_val = 128 + rescaled_diff;
|
||||
int32 output_val =
|
||||
std::min(static_cast<int32>(255),
|
||||
std::max(static_cast<int32>(0), unclamped_output_val));
|
||||
output_data[depth * i + c] = static_cast<uint8>(output_val);
|
||||
int32_t unclamped_output_val = 128 + rescaled_diff;
|
||||
int32_t output_val =
|
||||
std::min(static_cast<int32_t>(255),
|
||||
std::max(static_cast<int32_t>(0), unclamped_output_val));
|
||||
output_data[depth * i + c] = static_cast<uint8_t>(output_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_L2NORMALIZATION_H_
|
||||
|
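As a plain-float reference for the uint8_t L2Normalization kernel above: it approximates out = clamp(128 + 128 * (q - zero_point) / ||q - zero_point||, 0, 255), with the inverse square root realised through a quantized multiplier. The sketch below uses made-up input values and does the rescaling in float; it is not TFLite code.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const int zero_point = 128;  // assumed input zero point
  // Example row to normalize (chosen so the L2 norm is non-zero).
  const std::vector<uint8_t> input = {200, 90, 128, 255};

  float sum_sq = 0.0f;
  for (uint8_t q : input) {
    const float d = static_cast<float>(q) - zero_point;
    sum_sq += d * d;  // the kernel accumulates this as square_l2_norm
  }
  const float inv_norm = 1.0f / std::sqrt(sum_sq);

  for (uint8_t q : input) {
    const float d = static_cast<float>(q) - zero_point;
    // The kernel performs this rescaling with a quantized inverse-sqrt
    // multiplier; here it is done directly in float.
    const int out = std::min(
        255,
        std::max(0, 128 + static_cast<int>(std::lround(128.0f * d * inv_norm))));
    std::printf("%d ", out);
  }
  std::printf("\n");
  return 0;
}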
||||
@ -66,8 +66,8 @@ inline void Logistic(const LogisticParams&, const RuntimeShape& input_shape,
|
||||
}
|
||||
|
||||
inline void Logistic(const LogisticParams& params,
|
||||
const RuntimeShape& input_shape, const int16* input_data,
|
||||
const RuntimeShape& output_shape, int16* output_data) {
|
||||
const RuntimeShape& input_shape, const int16_t* input_data,
|
||||
const RuntimeShape& output_shape, int16_t* output_data) {
|
||||
const int flat_size = MatchingFlatSize(input_shape, output_shape);
|
||||
|
||||
for (int i = 0; i < flat_size; i++) {
|
||||
@ -84,12 +84,12 @@ inline void Logistic(const LogisticParams& params,
|
||||
}
|
||||
}
|
||||
|
||||
// Quantized int8 logistic activation. Cheats by dequantizing and requantizing
|
||||
// around the floating point logistic method. This implementation is slow on
|
||||
// platforms without a floating point unit.
|
||||
// Quantized int8_t logistic activation. Cheats by dequantizing and
|
||||
// requantizing around the floating point logistic method. This implementation
|
||||
// is slow on platforms without a floating point unit.
|
||||
|
||||
// TODO(b/141211002): Delete this int8 implementation once we can reuse the
|
||||
// approach used in TFLite for int8 Logistic.
|
||||
// TODO(b/141211002): Delete this int8_t implementation once we can reuse the
|
||||
// approach used in TFLite for int8_t Logistic.
|
||||
inline void Logistic(const RuntimeShape& input_shape, const int8_t* input_data,
|
||||
float input_scale, int input_zero_point,
|
||||
const RuntimeShape& output_shape, int8_t* output_data,
|
||||
|
||||
@ -24,20 +24,20 @@ namespace reference_ops {
|
||||
// Element-wise mul that can often be used for inner loop of broadcast Mul as
|
||||
// well as the non-broadcast Mul.
|
||||
inline void MulElementwise(int size, const ArithmeticParams& params,
|
||||
const uint8* input1_data, const uint8* input2_data,
|
||||
uint8* output_data) {
|
||||
const uint8_t* input1_data,
|
||||
const uint8_t* input2_data, uint8_t* output_data) {
|
||||
for (int i = 0; i < size; ++i) {
|
||||
const int32 input1_val = params.input1_offset + input1_data[i];
|
||||
const int32 input2_val = params.input2_offset + input2_data[i];
|
||||
const int32 unclamped_result =
|
||||
const int32_t input1_val = params.input1_offset + input1_data[i];
|
||||
const int32_t input2_val = params.input2_offset + input2_data[i];
|
||||
const int32_t unclamped_result =
|
||||
params.output_offset +
|
||||
MultiplyByQuantizedMultiplier(input1_val * input2_val,
|
||||
params.output_multiplier,
|
||||
params.output_shift);
|
||||
const int32 clamped_output =
|
||||
const int32_t clamped_output =
|
||||
std::min(params.quantized_activation_max,
|
||||
std::max(params.quantized_activation_min, unclamped_result));
|
||||
output_data[i] = static_cast<uint8>(clamped_output);
|
||||
output_data[i] = static_cast<uint8_t>(clamped_output);
|
||||
}
|
||||
}
|
||||
|
||||
@ -60,9 +60,9 @@ inline void Mul(const ArithmeticParams& params,
|
||||
}
|
||||
|
||||
inline void Mul(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape, const uint8* input1_data,
|
||||
const RuntimeShape& input2_shape, const uint8* input2_data,
|
||||
const RuntimeShape& output_shape, uint8* output_data) {
|
||||
const RuntimeShape& input1_shape, const uint8_t* input1_data,
|
||||
const RuntimeShape& input2_shape, const uint8_t* input2_data,
|
||||
const RuntimeShape& output_shape, uint8_t* output_data) {
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
const int flat_size =
|
||||
@ -73,11 +73,11 @@ inline void Mul(const ArithmeticParams& params,
|
||||
|
||||
inline void BroadcastMul4DSlow(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape,
|
||||
const uint8* input1_data,
|
||||
const uint8_t* input1_data,
|
||||
const RuntimeShape& input2_shape,
|
||||
const uint8* input2_data,
|
||||
const uint8_t* input2_data,
|
||||
const RuntimeShape& output_shape,
|
||||
uint8* output_data) {
|
||||
uint8_t* output_data) {
|
||||
NdArrayDesc<4> desc1;
|
||||
NdArrayDesc<4> desc2;
|
||||
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
|
||||
@ -89,22 +89,22 @@ inline void BroadcastMul4DSlow(const ArithmeticParams& params,
|
||||
for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
|
||||
for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
|
||||
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
|
||||
const int32 input1_val =
|
||||
const int32_t input1_val =
|
||||
params.input1_offset +
|
||||
input1_data[SubscriptToIndex(desc1, b, y, x, c)];
|
||||
const int32 input2_val =
|
||||
const int32_t input2_val =
|
||||
params.input2_offset +
|
||||
input2_data[SubscriptToIndex(desc2, b, y, x, c)];
|
||||
const int32 unclamped_result =
|
||||
const int32_t unclamped_result =
|
||||
params.output_offset +
|
||||
MultiplyByQuantizedMultiplier(input1_val * input2_val,
|
||||
params.output_multiplier,
|
||||
params.output_shift);
|
||||
const int32 clamped_output = std::min(
|
||||
const int32_t clamped_output = std::min(
|
||||
params.quantized_activation_max,
|
||||
std::max(params.quantized_activation_min, unclamped_result));
|
||||
output_data[Offset(extended_output_shape, b, y, x, c)] =
|
||||
static_cast<uint8>(clamped_output);
|
||||
static_cast<uint8_t>(clamped_output);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -32,8 +32,8 @@ constexpr int PadKernelMaxDimensionCount() { return 4; }
|
||||
// equivalent to a simple input1_data. For Pad, it should point to a zero
|
||||
// value.
|
||||
//
|
||||
// Note that two typenames are required, so that T=P=int32 is considered a
|
||||
// specialization distinct from P=int32.
|
||||
// Note that two typenames are required, so that T=P=int32_t is considered a
|
||||
// specialization distinct from P=int32_t.
|
||||
template <typename T, typename P>
|
||||
inline void PadImpl(const tflite::PadParams& op_params,
|
||||
const RuntimeShape& input_shape, const T* input_data,
|
||||
@ -116,11 +116,11 @@ inline void Pad(const tflite::PadParams& op_params,
|
||||
output_data);
|
||||
}
|
||||
|
||||
// The second (pad-value) input can be int32 when, say, the first is uint8.
|
||||
// The second (pad-value) input can be int32_t when, say, the first is uint8_t.
|
||||
template <typename T>
|
||||
inline void Pad(const tflite::PadParams& op_params,
|
||||
const RuntimeShape& input_shape, const T* input_data,
|
||||
const int32* pad_value_ptr, const RuntimeShape& output_shape,
|
||||
const int32_t* pad_value_ptr, const RuntimeShape& output_shape,
|
||||
T* output_data) {
|
||||
const T converted_pad_value = static_cast<T>(*pad_value_ptr);
|
||||
PadImpl(op_params, input_shape, input_data, &converted_pad_value,
|
||||
@ -130,40 +130,18 @@ inline void Pad(const tflite::PadParams& op_params,
|
||||
// This version avoids conflicting template matching.
|
||||
template <>
|
||||
inline void Pad(const tflite::PadParams& op_params,
|
||||
const RuntimeShape& input_shape, const int32* input_data,
|
||||
const int32* pad_value_ptr, const RuntimeShape& output_shape,
|
||||
int32* output_data) {
|
||||
const RuntimeShape& input_shape, const int32_t* input_data,
|
||||
const int32_t* pad_value_ptr, const RuntimeShape& output_shape,
|
||||
int32_t* output_data) {
|
||||
PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape,
|
||||
output_data);
|
||||
}
|
||||
|
||||
// One could make all PadImageStyle calls simply delegate the work to the
|
||||
// ordinary Pad. However, it is better that the reference code asserts false in
|
||||
// similar cases.
|
||||
template <typename T, typename P>
|
||||
inline void PadImageStyle(const tflite::PadParams& op_params,
|
||||
const RuntimeShape& input_shape, const T* input_data,
|
||||
const P* pad_value_ptr,
|
||||
const RuntimeShape& output_shape, T* output_data) {
|
||||
TFLITE_ASSERT_FALSE;
|
||||
}
|
||||
|
||||
template <typename P>
|
||||
inline void PadImageStyle(const tflite::PadParams& op_params,
|
||||
const RuntimeShape& input_shape,
|
||||
const uint8* input_data, const P* pad_value_ptr,
|
||||
const RuntimeShape& output_shape,
|
||||
uint8* output_data) {
|
||||
Pad(op_params, input_shape, input_data, pad_value_ptr, output_shape,
|
||||
output_data);
|
||||
}
|
||||
|
||||
template <typename P>
|
||||
inline void PadImageStyle(const tflite::PadParams& op_params,
|
||||
const RuntimeShape& input_shape,
|
||||
const int8_t* input_data, const P* pad_value_ptr,
|
||||
const RuntimeShape& output_shape,
|
||||
int8_t* output_data) {
|
||||
Pad(op_params, input_shape, input_data, pad_value_ptr, output_shape,
|
||||
output_data);
|
||||
}
|
||||
|
||||
@ -78,8 +78,9 @@ inline void AveragePool(const PoolParams& params,
|
||||
|
||||
inline void AveragePool(const PoolParams& params,
|
||||
const RuntimeShape& input_shape,
|
||||
const uint8* input_data,
|
||||
const RuntimeShape& output_shape, uint8* output_data) {
|
||||
const uint8_t* input_data,
|
||||
const RuntimeShape& output_shape,
|
||||
uint8_t* output_data) {
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
||||
@ -108,7 +109,7 @@ inline void AveragePool(const PoolParams& params,
|
||||
const int filter_y_start = std::max(0, -in_y_origin);
|
||||
const int filter_y_end =
|
||||
std::min(params.filter_height, input_height - in_y_origin);
|
||||
int32 acc = 0;
|
||||
int32_t acc = 0;
|
||||
int filter_count = 0;
|
||||
for (int filter_y = filter_y_start; filter_y < filter_y_end;
|
||||
++filter_y) {
|
||||
@ -125,7 +126,7 @@ inline void AveragePool(const PoolParams& params,
|
||||
acc = std::max(acc, params.quantized_activation_min);
|
||||
acc = std::min(acc, params.quantized_activation_max);
|
||||
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
|
||||
static_cast<uint8>(acc);
|
||||
static_cast<uint8_t>(acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -237,8 +238,8 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
|
||||
}
|
||||
|
||||
inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
|
||||
const uint8* input_data, const RuntimeShape& output_shape,
|
||||
uint8* output_data) {
|
||||
const uint8_t* input_data, const RuntimeShape& output_shape,
|
||||
uint8_t* output_data) {
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
TFLITE_DCHECK_GE(params.quantized_activation_min, 0);
|
||||
@ -269,7 +270,7 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
|
||||
const int filter_y_start = std::max(0, -in_y_origin);
|
||||
const int filter_y_end =
|
||||
std::min(params.filter_height, input_height - in_y_origin);
|
||||
uint8 max = 0;
|
||||
uint8_t max = 0;
|
||||
for (int filter_y = filter_y_start; filter_y < filter_y_end;
|
||||
++filter_y) {
|
||||
for (int filter_x = filter_x_start; filter_x < filter_x_end;
|
||||
@ -281,10 +282,10 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
|
||||
input_data[Offset(input_shape, batch, in_y, in_x, channel)]);
|
||||
}
|
||||
}
|
||||
max = std::max<uint8>(max, params.quantized_activation_min);
|
||||
max = std::min<uint8>(max, params.quantized_activation_max);
|
||||
max = std::max<uint8_t>(max, params.quantized_activation_min);
|
||||
max = std::min<uint8_t>(max, params.quantized_activation_max);
|
||||
output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
|
||||
static_cast<uint8>(max);
|
||||
static_cast<uint8_t>(max);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -23,7 +23,7 @@ namespace tflite {
|
||||
|
||||
namespace reference_ops {
|
||||
|
||||
// Broadcast prelu to output_shape for quantized uint8/int8 data.
|
||||
// Broadcast prelu to output_shape for quantized uint8_t/int8_t data.
|
||||
template <typename T>
|
||||
inline void BroadcastPrelu4DSlow(
|
||||
const PreluParams& params, const RuntimeShape& input_shape,
|
||||
@ -44,15 +44,15 @@ inline void BroadcastPrelu4DSlow(
|
||||
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
|
||||
int output_index = Offset(extended_output_shape, b, y, x, c);
|
||||
int input_index = SubscriptToIndex(desc1, b, y, x, c);
|
||||
const int32 input_value =
|
||||
const int32_t input_value =
|
||||
params.input_offset + input_data[input_index];
|
||||
int32 output_value;
|
||||
int32_t output_value;
|
||||
if (input_value >= 0) {
|
||||
output_value = MultiplyByQuantizedMultiplier(
|
||||
input_value, params.output_multiplier_1, params.output_shift_1);
|
||||
} else {
|
||||
auto alpha_index = SubscriptToIndex(desc2, b, y, x, c);
|
||||
const int32 alpha_value =
|
||||
const int32_t alpha_value =
|
||||
params.alpha_offset + alpha_data[alpha_index];
|
||||
|
||||
output_value = MultiplyByQuantizedMultiplier(
|
||||
@ -61,9 +61,9 @@ inline void BroadcastPrelu4DSlow(
|
||||
}
|
||||
output_value += params.output_offset;
|
||||
|
||||
const int32 quantized_min = std::numeric_limits<T>::min();
|
||||
const int32 quantized_max = std::numeric_limits<T>::max();
|
||||
const int32 clamped_output =
|
||||
const int32_t quantized_min = std::numeric_limits<T>::min();
|
||||
const int32_t quantized_max = std::numeric_limits<T>::max();
|
||||
const int32_t clamped_output =
|
||||
std::min(quantized_max, std::max(quantized_min, output_value));
|
||||
output_data[output_index] = static_cast<T>(clamped_output);
|
||||
}
|
||||
@ -72,6 +72,37 @@ inline void BroadcastPrelu4DSlow(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void Prelu(const PreluParams& params, const RuntimeShape& input_shape,
|
||||
const T* input_data, const RuntimeShape& alpha_shape,
|
||||
const T* alpha_data, const RuntimeShape& output_shape,
|
||||
T* output_data) {
|
||||
const int32_t quantized_min = std::numeric_limits<T>::min();
|
||||
const int32_t quantized_max = std::numeric_limits<T>::max();
|
||||
|
||||
const int flat_size =
|
||||
MatchingElementsSize(input_shape, alpha_shape, output_shape);
|
||||
for (int i = 0; i < flat_size; ++i) {
|
||||
const int32_t input_value = params.input_offset + input_data[i];
|
||||
int32_t output_value;
|
||||
if (input_value >= 0) {
|
||||
output_value = MultiplyByQuantizedMultiplier(
|
||||
input_value, params.output_multiplier_1, params.output_shift_1);
|
||||
} else {
|
||||
const int32_t alpha_value = params.alpha_offset + alpha_data[i];
|
||||
|
||||
output_value = MultiplyByQuantizedMultiplier(input_value * alpha_value,
|
||||
params.output_multiplier_2,
|
||||
params.output_shift_2);
|
||||
}
|
||||
output_value += params.output_offset;
|
||||
|
||||
const int32_t clamped_output =
|
||||
std::min(quantized_max, std::max(quantized_min, output_value));
|
||||
output_data[i] = static_cast<T>(clamped_output);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
|
||||
@ -76,6 +76,10 @@ inline bool ProcessBroadcastShapes(const RuntimeShape& shape0,
|
||||
BroadcastableOpCategory::kFirstInputBroadcastsFast &&
|
||||
params->broadcast_category !=
|
||||
BroadcastableOpCategory::kSecondInputBroadcastsFast) {
|
||||
// This is unreachable because at least one else clause in the above loop
|
||||
// must be reached.
|
||||
TFLITE_DCHECK(false);
|
||||
params->broadcast_category = BroadcastableOpCategory::kNonBroadcast;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@@ -15,7 +15,11 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_QUANTIZE_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_QUANTIZE_H_

#include <algorithm>
#include <limits>

#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/types.h"

@@ -29,18 +33,18 @@ inline void AffineQuantize(const tflite::QuantizationParams& op_params,
const InputT* input_data,
const RuntimeShape& output_shape,
OutputT* output_data) {
const int32 zero_point = op_params.zero_point;
const int32_t zero_point = op_params.zero_point;
const double scale = op_params.scale;
const int flat_size = MatchingFlatSize(input_shape, output_shape);
static constexpr int32 min_val = std::numeric_limits<OutputT>::min();
static constexpr int32 max_val = std::numeric_limits<OutputT>::max();
static constexpr int32_t min_val = std::numeric_limits<OutputT>::min();
static constexpr int32_t max_val = std::numeric_limits<OutputT>::max();

for (int i = 0; i < flat_size; i++) {
const InputT val = input_data[i];
int32 unclamped =
static_cast<int32>(TfLiteRound(val / static_cast<float>(scale))) +
int32_t unclamped =
static_cast<int32_t>(TfLiteRound(val / static_cast<float>(scale))) +
zero_point;
int32 clamped = std::min(std::max(unclamped, min_val), max_val);
int32_t clamped = std::min(std::max(unclamped, min_val), max_val);
output_data[i] = clamped;
}
}

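The AffineQuantize routine above implements q = clamp(round(x / scale) + zero_point, min, max). A minimal standalone illustration follows, specialised to int8_t, with std::lround standing in for TfLiteRound; the function name and the scale/zero-point values are invented for the example.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <limits>

int8_t QuantizeToInt8(float x, float scale, int32_t zero_point) {
  const int32_t min_val = std::numeric_limits<int8_t>::min();
  const int32_t max_val = std::numeric_limits<int8_t>::max();
  const int32_t unclamped =
      static_cast<int32_t>(std::lround(x / scale)) + zero_point;
  return static_cast<int8_t>(std::min(std::max(unclamped, min_val), max_val));
}

int main() {
  const float scale = 0.05f;      // example scale
  const int32_t zero_point = 10;  // example zero point
  std::printf("%d %d %d\n",
              QuantizeToInt8(0.0f, scale, zero_point),     // -> 10
              QuantizeToInt8(1.0f, scale, zero_point),     // -> 30
              QuantizeToInt8(100.0f, scale, zero_point));  // clamps to 127
  return 0;
}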
@ -18,6 +18,8 @@ limitations under the License.
|
||||
#include "ruy/profiler/instrumentation.h" // from @ruy
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/cppmath.h"
|
||||
#include "tensorflow/lite/kernels/internal/max.h"
|
||||
#include "tensorflow/lite/kernels/internal/min.h"
|
||||
#include "tensorflow/lite/kernels/internal/quantization_util.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
@ -249,9 +251,9 @@ inline void Mean(const tflite::MeanParams& op_params,
|
||||
|
||||
inline void Mean(const tflite::MeanParams& op_params,
|
||||
const RuntimeShape& unextended_input_shape,
|
||||
const uint8_t* input_data, int32 input_zero_point,
|
||||
const uint8_t* input_data, int32_t input_zero_point,
|
||||
float input_scale, const RuntimeShape& unextended_output_shape,
|
||||
uint8_t* output_data, int32 output_zero_point,
|
||||
uint8_t* output_data, int32_t output_zero_point,
|
||||
float output_scale) {
|
||||
ruy::profiler::ScopeLabel label("Mean4D/Uint8");
|
||||
|
||||
@ -280,9 +282,9 @@ inline void Mean(const tflite::MeanParams& op_params,
|
||||
constexpr int32_t kMinValue = std::numeric_limits<uint8_t>::min();
|
||||
constexpr int32_t kMaxValue = std::numeric_limits<uint8_t>::max();
|
||||
|
||||
int32 bias =
|
||||
int32_t bias =
|
||||
output_zero_point -
|
||||
static_cast<int32>(input_zero_point * input_scale / output_scale);
|
||||
static_cast<int32_t>(input_zero_point * input_scale / output_scale);
|
||||
double real_scale =
|
||||
static_cast<double>(input_scale / (num_elements_in_axis * output_scale));
|
||||
|
||||
@ -291,7 +293,7 @@ inline void Mean(const tflite::MeanParams& op_params,
|
||||
QuantizeMultiplier(real_scale, &multiplier, &shift);
|
||||
for (int out_b = 0; out_b < output_batch; ++out_b) {
|
||||
for (int out_d = 0; out_d < output_depth; ++out_d) {
|
||||
int32 acc = 0;
|
||||
int32_t acc = 0;
|
||||
for (int in_h = 0; in_h < input_height; ++in_h) {
|
||||
for (int in_w = 0; in_w < input_width; ++in_w) {
|
||||
acc += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
|
||||
@ -310,18 +312,21 @@ inline void Mean(const tflite::MeanParams& op_params,
|
||||
// It does so in two stages, first calculates the sum of elements along the axis
|
||||
// then divides it by the number of element in axis for quantized values.
|
||||
template <typename T, typename U>
|
||||
inline bool QuantizedMeanOrSum(const T* input_data, int32 input_zero_point,
|
||||
inline bool QuantizedMeanOrSum(const T* input_data, int32_t input_zero_point,
|
||||
float input_scale, const int* input_dims,
|
||||
const int input_num_dims, T* output_data,
|
||||
int32 output_zero_point, float output_scale,
|
||||
int32_t output_zero_point, float output_scale,
|
||||
const int* output_dims,
|
||||
const int output_num_dims, const int* axis,
|
||||
const int num_axis_dimensions, bool keep_dims,
|
||||
int* temp_index, int* resolved_axis, U* temp_sum,
|
||||
bool compute_sum) {
|
||||
const bool uint8_case = std::is_same<T, int8_t>::value;
|
||||
const bool uint8_case = std::is_same<T, uint8_t>::value;
|
||||
const bool int16_case = std::is_same<T, int16_t>::value;
|
||||
if (uint8_case) {
|
||||
ruy::profiler::ScopeLabel label(compute_sum ? "Sum/Uint8" : "Mean/Uint8");
|
||||
} else if (int16_case) {
|
||||
ruy::profiler::ScopeLabel label(compute_sum ? "Sum/Int16" : "Mean/Int16");
|
||||
} else {
|
||||
ruy::profiler::ScopeLabel label(compute_sum ? "Sum/Int8" : "Mean/Int8");
|
||||
}
|
||||
@ -381,11 +386,11 @@ inline bool QuantizedMeanOrSum(const T* input_data, int32 input_zero_point,
|
||||
for (size_t idx = 0; idx < num_outputs; ++idx) {
|
||||
float float_mean = static_cast<float>(temp_sum[idx]) /
|
||||
static_cast<float>(num_elements_in_axis);
|
||||
float result =
|
||||
std::min(TfLiteRound(float_mean * scale + bias) + output_zero_point,
|
||||
static_cast<float>(std::numeric_limits<T>::max()));
|
||||
result =
|
||||
std::max(result, static_cast<float>(std::numeric_limits<T>::min()));
|
||||
float result = TfLiteMin(
|
||||
TfLiteRound(float_mean * scale + bias) + output_zero_point,
|
||||
static_cast<float>(std::numeric_limits<T>::max()));
|
||||
result = TfLiteMax(result,
|
||||
static_cast<float>(std::numeric_limits<T>::min()));
|
||||
output_data[idx] = static_cast<T>(result);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,28 +17,30 @@ limitations under the License.

#include <cmath>

#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {

namespace reference_ops {

inline int32 GetNearestNeighbor(const int input_value, const int32 input_size,
const int32 output_size,
const bool align_corners,
const bool half_pixel_centers) {
inline int32_t GetNearestNeighbor(const int input_value,
const int32_t input_size,
const int32_t output_size,
const bool align_corners,
const bool half_pixel_centers) {
const float scale =
(align_corners && output_size > 1)
? (input_size - 1) / static_cast<float>(output_size - 1)
: input_size / static_cast<float>(output_size);
const float offset = half_pixel_centers ? 0.5f : 0.0f;
int32 output_value = std::min(
int32_t output_value = std::min(
align_corners
? static_cast<int32>(std::round((input_value + offset) * scale))
: static_cast<int32>(std::floor((input_value + offset) * scale)),
? static_cast<int32_t>(TfLiteRound((input_value + offset) * scale))
: static_cast<int32_t>(std::floor((input_value + offset) * scale)),
input_size - 1);
if (half_pixel_centers) {
output_value = std::max(static_cast<int32_t>(0), output_value);
output_value = std::max(static_cast<int32_t>(0), output_value);
}
return output_value;
}
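The index mapping computed by GetNearestNeighbor above can be exercised on its own. The standalone copy below mirrors that logic (with std::round standing in for TfLiteRound) and shows the resulting column mapping for a 4-to-8 upscale; the function name is invented for the example.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int32_t NearestNeighborIndex(int input_value, int32_t input_size,
                             int32_t output_size, bool align_corners,
                             bool half_pixel_centers) {
  const float scale =
      (align_corners && output_size > 1)
          ? (input_size - 1) / static_cast<float>(output_size - 1)
          : input_size / static_cast<float>(output_size);
  const float offset = half_pixel_centers ? 0.5f : 0.0f;
  int32_t out = std::min(
      align_corners
          ? static_cast<int32_t>(std::round((input_value + offset) * scale))
          : static_cast<int32_t>(std::floor((input_value + offset) * scale)),
      input_size - 1);
  if (half_pixel_centers) {
    out = std::max(static_cast<int32_t>(0), out);
  }
  return out;
}

int main() {
  // Upscale a 4-wide row to 8 output columns (no align_corners, no half-pixel
  // centers): each input column is simply repeated twice.
  for (int x = 0; x < 8; ++x) {
    std::printf("%d ", NearestNeighborIndex(x, 4, 8, false, false));
  }
  std::printf("\n");  // prints: 0 0 1 1 2 2 3 3
  return 0;
}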
@ -47,7 +49,7 @@ template <typename T>
|
||||
inline void ResizeNearestNeighbor(
|
||||
const tflite::ResizeNearestNeighborParams& op_params,
|
||||
const RuntimeShape& unextended_input_shape, const T* input_data,
|
||||
const RuntimeShape& output_size_shape, const int32* output_size_data,
|
||||
const RuntimeShape& output_size_shape, const int32_t* output_size_data,
|
||||
const RuntimeShape& unextended_output_shape, T* output_data) {
|
||||
TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
|
||||
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
|
||||
@ -57,16 +59,16 @@ inline void ResizeNearestNeighbor(
|
||||
const RuntimeShape output_shape =
|
||||
RuntimeShape::ExtendedShape(4, unextended_output_shape);
|
||||
|
||||
int32 batches = MatchingDim(input_shape, 0, output_shape, 0);
|
||||
int32 input_height = input_shape.Dims(1);
|
||||
int32 input_width = input_shape.Dims(2);
|
||||
int32 depth = MatchingDim(input_shape, 3, output_shape, 3);
|
||||
int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
|
||||
int32_t input_height = input_shape.Dims(1);
|
||||
int32_t input_width = input_shape.Dims(2);
|
||||
int32_t depth = MatchingDim(input_shape, 3, output_shape, 3);
|
||||
|
||||
// The Tensorflow version of this op allows resize on the width and height
|
||||
// axis only.
|
||||
TFLITE_DCHECK_EQ(output_size_shape.FlatSize(), 2);
|
||||
int32 output_height = output_size_data[0];
|
||||
int32 output_width = output_size_data[1];
|
||||
int32_t output_height = output_size_data[0];
|
||||
int32_t output_width = output_size_data[1];
|
||||
|
||||
const int col_offset = input_shape.Dims(3);
|
||||
const int row_offset = input_shape.Dims(2) * col_offset;
|
||||
@ -76,14 +78,14 @@ inline void ResizeNearestNeighbor(
|
||||
T* output_ptr = output_data;
|
||||
for (int b = 0; b < batches; ++b) {
|
||||
for (int y = 0; y < output_height; ++y) {
|
||||
int32 in_y = GetNearestNeighbor(y, input_height, output_height,
|
||||
op_params.align_corners,
|
||||
op_params.half_pixel_centers);
|
||||
const T* y_input_ptr = input_ptr + in_y * row_offset;
|
||||
for (int x = 0; x < output_width; ++x) {
|
||||
int32 in_x = GetNearestNeighbor(x, input_width, output_width,
|
||||
int32_t in_y = GetNearestNeighbor(y, input_height, output_height,
|
||||
op_params.align_corners,
|
||||
op_params.half_pixel_centers);
|
||||
const T* y_input_ptr = input_ptr + in_y * row_offset;
|
||||
for (int x = 0; x < output_width; ++x) {
|
||||
int32_t in_x = GetNearestNeighbor(x, input_width, output_width,
|
||||
op_params.align_corners,
|
||||
op_params.half_pixel_centers);
|
||||
const T* x_input_ptr = y_input_ptr + in_x * col_offset;
|
||||
memcpy(output_ptr, x_input_ptr, depth * sizeof(T));
|
||||
output_ptr += depth;
|
||||
|
||||
@ -62,13 +62,14 @@ inline void Softmax(const SoftmaxParams& params,
|
||||
}
|
||||
}
|
||||
|
||||
// Quantized softmax with int8/uint8 input and int8/uint8/int16 output.
|
||||
// Quantized softmax with int8_t/uint8_t input and int8_t/uint8_t/int16_t
|
||||
// output.
|
||||
template <typename InputT, typename OutputT>
|
||||
inline void Softmax(const SoftmaxParams& params,
|
||||
const RuntimeShape& input_shape, const InputT* input_data,
|
||||
const RuntimeShape& output_shape, OutputT* output_data) {
|
||||
const int32 input_beta_multiplier = params.input_multiplier;
|
||||
const int32 input_beta_left_shift = params.input_left_shift;
|
||||
const int32_t input_beta_multiplier = params.input_multiplier;
|
||||
const int32_t input_beta_left_shift = params.input_left_shift;
|
||||
const int diff_min = params.diff_min;
|
||||
// The representation chosen for the input to the exp() function is Q5.26.
|
||||
// We need to leave extra space since values that we skip might be as large as
|
||||
@ -78,9 +79,10 @@ inline void Softmax(const SoftmaxParams& params,
|
||||
static const int kScaledDiffIntegerBits = 5;
|
||||
static const int kAccumulationIntegerBits = 12;
|
||||
using FixedPointScaledDiff =
|
||||
gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
|
||||
using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
|
||||
using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
|
||||
gemmlowp::FixedPoint<int32_t, kScaledDiffIntegerBits>;
|
||||
using FixedPointAccum =
|
||||
gemmlowp::FixedPoint<int32_t, kAccumulationIntegerBits>;
|
||||
using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>;
|
||||
|
||||
const int trailing_dim = input_shape.DimensionsCount() - 1;
|
||||
const int outer_size =
|
||||
@ -96,10 +98,10 @@ inline void Softmax(const SoftmaxParams& params,
|
||||
|
||||
FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
int32 input_diff =
|
||||
static_cast<int32>(input_data[i * depth + c]) - max_in_row;
|
||||
int32_t input_diff =
|
||||
static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
|
||||
if (input_diff >= diff_min) {
|
||||
const int32 input_diff_rescaled =
|
||||
const int32_t input_diff_rescaled =
|
||||
MultiplyByQuantizedMultiplierGreaterThanOne(
|
||||
input_diff, input_beta_multiplier, input_beta_left_shift);
|
||||
const FixedPointScaledDiff scaled_diff_f8 =
|
||||
@ -114,28 +116,28 @@ inline void Softmax(const SoftmaxParams& params,
|
||||
sum_of_exps.raw(), kAccumulationIntegerBits, &num_bits_over_unit));
|
||||
|
||||
for (int c = 0; c < depth; ++c) {
|
||||
int32 input_diff =
|
||||
static_cast<int32>(input_data[i * depth + c]) - max_in_row;
|
||||
int32_t input_diff =
|
||||
static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
|
||||
if (input_diff >= diff_min) {
|
||||
const int32 input_diff_rescaled =
|
||||
const int32_t input_diff_rescaled =
|
||||
MultiplyByQuantizedMultiplierGreaterThanOne(
|
||||
input_diff, input_beta_multiplier, input_beta_left_shift);
|
||||
const FixedPointScaledDiff scaled_diff_f8 =
|
||||
FixedPointScaledDiff::FromRaw(input_diff_rescaled);
|
||||
|
||||
FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
|
||||
int32 unsat_output = gemmlowp::RoundingDivideByPOT(
|
||||
int32_t unsat_output = gemmlowp::RoundingDivideByPOT(
|
||||
(shifted_scale * exp_in_0).raw(),
|
||||
num_bits_over_unit + 31 - (sizeof(OutputT) * 8));
|
||||
|
||||
const int32 shifted_output =
|
||||
const int32_t shifted_output =
|
||||
unsat_output +
|
||||
static_cast<int32>(std::numeric_limits<OutputT>::min());
|
||||
static_cast<int32_t>(std::numeric_limits<OutputT>::min());
|
||||
|
||||
output_data[i * depth + c] = static_cast<OutputT>(std::max(
|
||||
std::min(shifted_output,
|
||||
static_cast<int32>(std::numeric_limits<OutputT>::max())),
|
||||
static_cast<int32>(std::numeric_limits<OutputT>::min())));
|
||||
static_cast<int32_t>(std::numeric_limits<OutputT>::max())),
|
||||
static_cast<int32_t>(std::numeric_limits<OutputT>::min())));
|
||||
} else {
|
||||
output_data[i * depth + c] = std::numeric_limits<OutputT>::min();
|
||||
}
|
||||
@ -143,7 +145,7 @@ inline void Softmax(const SoftmaxParams& params,
|
||||
}
|
||||
}
|
||||
|
||||
// Quantized softmax with int16 input and int16 output.
|
||||
// Quantized softmax with int16_t input and int16_t output.
|
||||
inline void SoftmaxInt16(const SoftmaxParams& params,
|
||||
const RuntimeShape& input_shape,
|
||||
const int16_t* input_data,
|
||||
|
||||
@ -16,8 +16,10 @@ limitations under the License.
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_STRIDED_SLICE_H_
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/compatibility.h"
|
||||
#include "tensorflow/lite/kernels/internal/strided_slice_logic.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
namespace reference_ops {
|
||||
|
||||
@ -15,9 +15,15 @@ limitations under the License.
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SUB_H_
|
||||
|
||||
#include "fixedpoint/fixedpoint.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
|
||||
#include "ruy/profiler/instrumentation.h" // from @ruy
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/compatibility.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
@ -41,11 +47,11 @@ inline void SubNonBroadcast(const ArithmeticParams& params,
|
||||
|
||||
inline void SubNonBroadcast(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape,
|
||||
const int32* input1_data,
|
||||
const int32_t* input1_data,
|
||||
const RuntimeShape& input2_shape,
|
||||
const int32* input2_data,
|
||||
const int32_t* input2_data,
|
||||
const RuntimeShape& output_shape,
|
||||
int32* output_data) {
|
||||
int32_t* output_data) {
|
||||
const int flat_size =
|
||||
MatchingElementsSize(input1_shape, input2_shape, output_shape);
|
||||
for (int i = 0; i < flat_size; ++i) {
|
||||
@ -106,12 +112,12 @@ inline void BroadcastSubSlow(const ArithmeticParams& params,
|
||||
template <int N = 5>
|
||||
inline void BroadcastSubSlow(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape,
|
||||
const uint8* input1_data,
|
||||
const uint8_t* input1_data,
|
||||
const RuntimeShape& input2_shape,
|
||||
const uint8* input2_data,
|
||||
const uint8_t* input2_data,
|
||||
const RuntimeShape& output_shape,
|
||||
uint8* output_data) {
|
||||
ruy::profiler::ScopeLabel label("BroadcastSubSlow/uint8");
|
||||
uint8_t* output_data) {
|
||||
ruy::profiler::ScopeLabel label("BroadcastSubSlow/uint8_t");
|
||||
TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
|
||||
TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
|
||||
TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
|
||||
@ -134,28 +140,28 @@ inline void BroadcastSubSlow(const ArithmeticParams& params,
|
||||
// nesting loops such that the innermost loop has the smallest stride for the
|
||||
// best cache behavior.
|
||||
auto sub_func = [&](int indexes[N]) {
|
||||
const int32 input1_val =
|
||||
const int32_t input1_val =
|
||||
params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
|
||||
const int32 input2_val =
|
||||
const int32_t input2_val =
|
||||
params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
|
||||
const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
|
||||
const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
|
||||
const int32 scaled_input1_val =
|
||||
const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
|
||||
const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
|
||||
const int32_t scaled_input1_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input1_val, params.input1_multiplier, params.input1_shift);
|
||||
const int32 scaled_input2_val =
|
||||
const int32_t scaled_input2_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input2_val, params.input2_multiplier, params.input2_shift);
|
||||
const int32 raw_sub = scaled_input1_val - scaled_input2_val;
|
||||
const int32 raw_output =
|
||||
const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
|
||||
const int32_t raw_output =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
raw_sub, params.output_multiplier, params.output_shift) +
|
||||
params.output_offset;
|
||||
const int32 clamped_output =
|
||||
const int32_t clamped_output =
|
||||
std::min(params.quantized_activation_max,
|
||||
std::max(params.quantized_activation_min, raw_output));
|
||||
output_data[SubscriptToIndex(output_desc, indexes)] =
|
||||
static_cast<uint8>(clamped_output);
|
||||
static_cast<uint8_t>(clamped_output);
|
||||
};
|
||||
NDOpsHelper<N>(output_desc, sub_func);
|
||||
}
|
||||
@ -163,12 +169,12 @@ inline void BroadcastSubSlow(const ArithmeticParams& params,
|
||||
template <int N = 5>
|
||||
inline void BroadcastSubSlow(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape,
|
||||
const int32* input1_data,
|
||||
const int32_t* input1_data,
|
||||
const RuntimeShape& input2_shape,
|
||||
const int32* input2_data,
|
||||
const int32_t* input2_data,
|
||||
const RuntimeShape& output_shape,
|
||||
int32* output_data) {
|
||||
ruy::profiler::ScopeLabel label("BroadcastSubSlow/int32");
|
||||
int32_t* output_data) {
|
||||
ruy::profiler::ScopeLabel label("BroadcastSubSlow/int32_t");
|
||||
TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
|
||||
TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
|
||||
TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
|
||||
@ -208,7 +214,7 @@ inline void BroadcastSubSlow(const ArithmeticParams& params,
|
||||
const int8_t* input2_data,
|
||||
const RuntimeShape& output_shape,
|
||||
int8_t* output_data) {
|
||||
ruy::profiler::ScopeLabel label("BroadcastSubSlow/int8");
|
||||
ruy::profiler::ScopeLabel label("BroadcastSubSlow/int8_t");
|
||||
NdArrayDesc<N> desc1;
|
||||
NdArrayDesc<N> desc2;
|
||||
NdArrayDesc<N> output_desc;
|
||||
@ -254,6 +260,45 @@ inline void BroadcastSubSlow(const ArithmeticParams& params,
|
||||
NDOpsHelper<N>(output_desc, sub_func);
|
||||
}
|
||||
|
||||
template <int N = 5>
|
||||
void BroadcastSubSlow(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape,
|
||||
const int64_t* input1_data,
|
||||
const RuntimeShape& input2_shape,
|
||||
const int64_t* input2_data,
|
||||
const RuntimeShape& output_shape, int64_t* output_data) {
|
||||
ruy::profiler::ScopeLabel label("BroadcastSubSlow/int64_t");
|
||||
TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
|
||||
TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
|
||||
TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
|
||||
NdArrayDesc<N> desc1;
|
||||
NdArrayDesc<N> desc2;
|
||||
NdArrayDesc<N> output_desc;
|
||||
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
|
||||
&desc2);
|
||||
CopyDimsToDesc(RuntimeShape::ExtendedShape(N, output_shape), &output_desc);
|
||||
|
||||
// In Tensorflow, the dimensions are canonically named (batch_number, row,
|
||||
// col, channel), with extents (batches, height, width, depth), with the
|
||||
// trailing dimension changing most rapidly (channels has the smallest stride,
|
||||
// typically 1 element).
|
||||
//
|
||||
// In generated C code, we store arrays with the dimensions reversed. The
|
||||
// first dimension has smallest stride.
|
||||
//
|
||||
// We name our variables by their Tensorflow convention, but generate C code
|
||||
// nesting loops such that the innermost loop has the smallest stride for the
|
||||
// best cache behavior.
|
||||
auto sub_func = [&](int indexes[N]) {
|
||||
output_data[SubscriptToIndex(output_desc, indexes)] =
|
||||
ActivationFunctionWithMinMax(
|
||||
input1_data[SubscriptToIndex(desc1, indexes)] -
|
||||
input2_data[SubscriptToIndex(desc2, indexes)],
|
||||
params.int64_activation_min, params.int64_activation_max);
|
||||
};
|
||||
NDOpsHelper<N>(output_desc, sub_func);
|
||||
}
|
||||
|
||||
template <typename T, int N = 5>
|
||||
void BroadcastSubSlow(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape, const T* input1_data,
|
||||
@ -294,33 +339,33 @@ void BroadcastSubSlow(const ArithmeticParams& params,
|
||||
// Element-wise Sub that can often be used for inner loop of broadcast sub as
|
||||
// well as the non-broadcast sub.
|
||||
inline void SubElementwise(int size, const ArithmeticParams& params,
|
||||
const uint8* input1_data, const uint8* input2_data,
|
||||
uint8* output_data) {
|
||||
const uint8_t* input1_data,
|
||||
const uint8_t* input2_data, uint8_t* output_data) {
|
||||
TFLITE_DCHECK_GT(params.input1_offset, -256);
|
||||
TFLITE_DCHECK_GT(params.input2_offset, -256);
|
||||
TFLITE_DCHECK_LT(params.input1_offset, 256);
|
||||
TFLITE_DCHECK_LT(params.input2_offset, 256);
|
||||
|
||||
for (int i = 0; i < size; ++i) {
|
||||
const int32 input1_val = params.input1_offset + input1_data[i];
|
||||
const int32 input2_val = params.input2_offset + input2_data[i];
|
||||
const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
|
||||
const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
|
||||
const int32 scaled_input1_val =
|
||||
const int32_t input1_val = params.input1_offset + input1_data[i];
|
||||
const int32_t input2_val = params.input2_offset + input2_data[i];
|
||||
const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
|
||||
const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
|
||||
const int32_t scaled_input1_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input1_val, params.input1_multiplier, params.input1_shift);
|
||||
const int32 scaled_input2_val =
|
||||
const int32_t scaled_input2_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input2_val, params.input2_multiplier, params.input2_shift);
|
||||
const int32 raw_sub = scaled_input1_val - scaled_input2_val;
|
||||
const int32 raw_output =
|
||||
const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
|
||||
const int32_t raw_output =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
raw_sub, params.output_multiplier, params.output_shift) +
|
||||
params.output_offset;
|
||||
const int32 clamped_output =
|
||||
const int32_t clamped_output =
|
||||
std::min(params.quantized_activation_max,
|
||||
std::max(params.quantized_activation_min, raw_output));
|
||||
output_data[i] = static_cast<uint8>(clamped_output);
|
||||
output_data[i] = static_cast<uint8_t>(clamped_output);
|
||||
}
|
||||
}
|
||||
|
||||
@ -336,22 +381,22 @@ inline void SubElementwise(int size, const ArithmeticParams& params,
|
||||
TFLITE_DCHECK_LE(params.input2_offset, int8_max_value);
|
||||
|
||||
for (int i = 0; i < size; ++i) {
|
||||
const int32 input1_val = params.input1_offset + input1_data[i];
|
||||
const int32 input2_val = params.input2_offset + input2_data[i];
|
||||
const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
|
||||
const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
|
||||
const int32 scaled_input1_val =
|
||||
const int32_t input1_val = params.input1_offset + input1_data[i];
|
||||
const int32_t input2_val = params.input2_offset + input2_data[i];
|
||||
const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
|
||||
const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
|
||||
const int32_t scaled_input1_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input1_val, params.input1_multiplier, params.input1_shift);
|
||||
const int32 scaled_input2_val =
|
||||
const int32_t scaled_input2_val =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
shifted_input2_val, params.input2_multiplier, params.input2_shift);
|
||||
const int32 raw_sub = scaled_input1_val - scaled_input2_val;
|
||||
const int32 raw_output =
|
||||
const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
|
||||
const int32_t raw_output =
|
||||
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
||||
raw_sub, params.output_multiplier, params.output_shift) +
|
||||
params.output_offset;
|
||||
const int32 clamped_output =
|
||||
const int32_t clamped_output =
|
||||
std::min(params.quantized_activation_max,
|
||||
std::max(params.quantized_activation_min, raw_output));
|
||||
output_data[i] = static_cast<int8_t>(clamped_output);
|
||||
@ -359,9 +404,9 @@ inline void SubElementwise(int size, const ArithmeticParams& params,
|
||||
}
|
||||
|
||||
inline void Sub(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape, const uint8* input1_data,
|
||||
const RuntimeShape& input2_shape, const uint8* input2_data,
|
||||
const RuntimeShape& output_shape, uint8* output_data) {
|
||||
const RuntimeShape& input1_shape, const uint8_t* input1_data,
|
||||
const RuntimeShape& input2_shape, const uint8_t* input2_data,
|
||||
const RuntimeShape& output_shape, uint8_t* output_data) {
|
||||
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
const int flat_size =
|
||||
@ -428,40 +473,43 @@ void Sub(const ArithmeticParams& params, const RuntimeShape& input1_shape,
|
||||
}
|
||||
}
|
||||
|
||||
inline void SubWithActivation(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape,
|
||||
const int32* input1_data,
|
||||
const RuntimeShape& input2_shape,
|
||||
const int32* input2_data,
|
||||
const RuntimeShape& output_shape,
|
||||
int32* output_data) {
|
||||
inline void SetActivationMinMax(const ArithmeticParams& params,
|
||||
int32_t* activation_min,
|
||||
int32_t* activation_max) {
|
||||
*activation_min = params.quantized_activation_min;
|
||||
*activation_max = params.quantized_activation_max;
|
||||
}
|
||||
|
||||
inline void SetActivationMinMax(const ArithmeticParams& params,
|
||||
float* activation_min, float* activation_max) {
|
||||
*activation_min = params.float_activation_min;
|
||||
*activation_max = params.float_activation_max;
|
||||
}
|
||||
|
||||
inline void SetActivationMinMax(const ArithmeticParams& params,
|
||||
int64_t* activation_min,
|
||||
int64_t* activation_max) {
|
||||
*activation_min = params.int64_activation_min;
|
||||
*activation_max = params.int64_activation_max;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void SubWithActivation(
|
||||
const ArithmeticParams& params, const RuntimeShape& input1_shape,
|
||||
const T* input1_data, const RuntimeShape& input2_shape,
|
||||
const T* input2_data, const RuntimeShape& output_shape, T* output_data) {
|
||||
ruy::profiler::ScopeLabel label("SubWithActivation");
|
||||
const int flat_size =
|
||||
MatchingElementsSize(input1_shape, input2_shape, output_shape);
|
||||
T activation_min, activation_max;
|
||||
SetActivationMinMax(params, &activation_min, &activation_max);
|
||||
|
||||
for (int i = 0; i < flat_size; ++i) {
|
||||
output_data[i] = ActivationFunctionWithMinMax(
|
||||
input1_data[i] - input2_data[i], params.quantized_activation_min,
|
||||
params.quantized_activation_max);
|
||||
input1_data[i] - input2_data[i], activation_min, activation_max);
|
||||
}
|
||||
}
|
||||
|
||||
inline void SubWithActivation(const ArithmeticParams& params,
|
||||
const RuntimeShape& input1_shape,
|
||||
const float* input1_data,
|
||||
const RuntimeShape& input2_shape,
|
||||
const float* input2_data,
|
||||
const RuntimeShape& output_shape,
|
||||
float* output_data) {
|
||||
const int flat_size =
|
||||
MatchingElementsSize(input1_shape, input2_shape, output_shape);
|
||||
for (int i = 0; i < flat_size; ++i) {
|
||||
output_data[i] = ActivationFunctionWithMinMax(
|
||||
input1_data[i] - input2_data[i], params.float_activation_min,
|
||||
params.float_activation_max);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} // namespace reference_ops
|
||||
} // namespace tflite
|
||||
|
||||
|
||||
@ -24,24 +24,29 @@ limitations under the License.
|
||||
|
||||
namespace tflite {
|
||||
|
||||
enum class FusedActivationFunctionType : uint8 { kNone, kRelu6, kRelu1, kRelu };
|
||||
enum class PaddingType : uint8 { kNone, kSame, kValid };
|
||||
enum class FusedActivationFunctionType : uint8_t {
|
||||
kNone,
|
||||
kRelu6,
|
||||
kRelu1,
|
||||
kRelu
|
||||
};
|
||||
enum class PaddingType : uint8_t { kNone, kSame, kValid };
|
||||
|
||||
struct PaddingValues {
|
||||
int16 width;
|
||||
int16 height;
|
||||
int16_t width;
|
||||
int16_t height;
|
||||
// offset is used for calculating "remaining" padding, for example, `width`
|
||||
// is 1 and `width_offset` is 1, so padding_left is 1 while padding_right is
|
||||
// 1 + 1 = 2.
|
||||
int16 width_offset;
|
||||
int16_t width_offset;
|
||||
// Same as width_offset except it's over the height dimension.
|
||||
int16 height_offset;
|
||||
int16_t height_offset;
|
||||
};
|
||||
|
||||
// This enumeration allows for non-default formats for the weights array
|
||||
// of a fully-connected operator, allowing the use of special optimized
|
||||
// runtime paths.
|
||||
enum class FullyConnectedWeightsFormat : uint8 {
|
||||
enum class FullyConnectedWeightsFormat : uint8_t {
|
||||
// Default format (flat 2D layout, the inner contiguous dimension
|
||||
// is input_depth, the outer non-contiguous dimension is output_depth)
|
||||
kDefault,
|
||||
@ -88,11 +93,11 @@ enum class FullyConnectedWeightsFormat : uint8 {
|
||||
// maximize arithmetic throughput.
|
||||
//
|
||||
// Finally, the 'Int8' part in the name refers to the fact that this
|
||||
// weights format has each weights value encoded as a signed int8 value,
|
||||
// even if the data type of the weights buffer is uint8. This is intended
|
||||
// weights format has each weights value encoded as a signed int8_t value,
|
||||
// even if the data type of the weights buffer is uint8_t. This is intended
|
||||
// to save runtime kernels the effort to have to XOR the top bit of these
|
||||
// bytes before using them in signed arithmetic, see this file for more
|
||||
// explanations on the 'signed int8 trick' in matrix multiplication kernels:
|
||||
// explanations on the 'signed int8_t trick' in matrix multiplication kernels:
|
||||
//
|
||||
// tensorflow/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc
|
||||
//
|
||||
@ -111,7 +116,7 @@ enum class FullyConnectedWeightsFormat : uint8 {
|
||||
// the real 0 value, and scale designates the difference between the real values
|
||||
// corresponding to consecutive quantized values differing by 1.
|
||||
struct QuantizationParams {
|
||||
int32 zero_point = 0;
|
||||
int32_t zero_point = 0;
|
||||
double scale = 0.0;
|
||||
};
|
||||
|
||||
@ -140,20 +145,20 @@ class RuntimeShape {
|
||||
if (dimensions_count > kMaxSmallSize) {
|
||||
#ifdef TF_LITE_STATIC_MEMORY
|
||||
TFLITE_CHECK(false && "No shape resizing supported on this platform");
|
||||
#else // TF_LITE_STATIC_MEMORY
|
||||
dims_pointer_ = new int32[dimensions_count];
|
||||
#else // TF_LITE_STATIC_MEMORY
|
||||
dims_pointer_ = new int32_t[dimensions_count];
|
||||
#endif // TF_LITE_STATIC_MEMORY
|
||||
}
|
||||
}
|
||||
|
||||
RuntimeShape(int shape_size, int32 value) : size_(0) {
|
||||
RuntimeShape(int shape_size, int32_t value) : size_(0) {
|
||||
Resize(shape_size);
|
||||
for (int i = 0; i < shape_size; ++i) {
|
||||
SetDim(i, value);
|
||||
}
|
||||
}
|
||||
|
||||
RuntimeShape(int dimensions_count, const int32* dims_data) : size_(0) {
|
||||
RuntimeShape(int dimensions_count, const int32_t* dims_data) : size_(0) {
|
||||
ReplaceWith(dimensions_count, dims_data);
|
||||
}
|
||||
|
||||
@ -165,33 +170,34 @@ class RuntimeShape {
|
||||
// rolls out.
|
||||
RuntimeShape(RuntimeShape const& other) : size_(other.DimensionsCount()) {
|
||||
if (size_ > kMaxSmallSize) {
|
||||
dims_pointer_ = new int32[size_];
|
||||
dims_pointer_ = new int32_t[size_];
|
||||
}
|
||||
std::memcpy(DimsData(), other.DimsData(), sizeof(int32) * size_);
|
||||
std::memcpy(DimsData(), other.DimsData(), sizeof(int32_t) * size_);
|
||||
}
|
||||
|
||||
bool operator==(const RuntimeShape& comp) const {
|
||||
return this->size_ == comp.size_ &&
|
||||
std::memcmp(DimsData(), comp.DimsData(), size_ * sizeof(int32)) == 0;
|
||||
std::memcmp(DimsData(), comp.DimsData(), size_ * sizeof(int32_t)) ==
|
||||
0;
|
||||
}
|
||||
|
||||
~RuntimeShape() {
|
||||
if (size_ > kMaxSmallSize) {
|
||||
#ifdef TF_LITE_STATIC_MEMORY
|
||||
TFLITE_CHECK(false && "No shape resizing supported on this platform");
|
||||
#else // TF_LITE_STATIC_MEMORY
|
||||
#else // TF_LITE_STATIC_MEMORY
|
||||
delete[] dims_pointer_;
|
||||
#endif // TF_LITE_STATIC_MEMORY
|
||||
}
|
||||
}
|
||||
|
||||
inline int32 DimensionsCount() const { return size_; }
|
||||
inline int32 Dims(int i) const {
|
||||
inline int32_t DimensionsCount() const { return size_; }
|
||||
inline int32_t Dims(int i) const {
|
||||
TFLITE_DCHECK_GE(i, 0);
|
||||
TFLITE_DCHECK_LT(i, size_);
|
||||
return size_ > kMaxSmallSize ? dims_pointer_[i] : dims_[i];
|
||||
}
|
||||
inline void SetDim(int i, int32 val) {
|
||||
inline void SetDim(int i, int32_t val) {
|
||||
TFLITE_DCHECK_GE(i, 0);
|
||||
TFLITE_DCHECK_LT(i, size_);
|
||||
if (size_ > kMaxSmallSize) {
|
||||
@ -201,20 +207,20 @@ class RuntimeShape {
|
||||
}
|
||||
}
|
||||
|
||||
inline int32* DimsData() {
|
||||
inline int32_t* DimsData() {
|
||||
return size_ > kMaxSmallSize ? dims_pointer_ : dims_;
|
||||
}
|
||||
inline const int32* DimsData() const {
|
||||
inline const int32_t* DimsData() const {
|
||||
return size_ > kMaxSmallSize ? dims_pointer_ : dims_;
|
||||
}
|
||||
// The caller must ensure that the shape is no bigger than 5-D.
|
||||
inline const int32* DimsDataUpTo5D() const { return dims_; }
|
||||
inline const int32_t* DimsDataUpTo5D() const { return dims_; }
|
||||
|
||||
inline void Resize(int dimensions_count) {
|
||||
if (size_ > kMaxSmallSize) {
|
||||
#ifdef TF_LITE_STATIC_MEMORY
|
||||
TFLITE_CHECK(false && "No shape resizing supported on this platform");
|
||||
#else // TF_LITE_STATIC_MEMORY
|
||||
#else // TF_LITE_STATIC_MEMORY
|
||||
delete[] dims_pointer_;
|
||||
#endif // TF_LITE_STATIC_MEMORY
|
||||
}
|
||||
@ -222,16 +228,16 @@ class RuntimeShape {
|
||||
if (dimensions_count > kMaxSmallSize) {
|
||||
#ifdef TF_LITE_STATIC_MEMORY
|
||||
TFLITE_CHECK(false && "No shape resizing supported on this platform");
|
||||
#else // TF_LITE_STATIC_MEMORY
|
||||
dims_pointer_ = new int32[dimensions_count];
|
||||
#else // TF_LITE_STATIC_MEMORY
|
||||
dims_pointer_ = new int32_t[dimensions_count];
|
||||
#endif // TF_LITE_STATIC_MEMORY
|
||||
}
|
||||
}
|
||||
|
||||
inline void ReplaceWith(int dimensions_count, const int32* dims_data) {
|
||||
inline void ReplaceWith(int dimensions_count, const int32_t* dims_data) {
|
||||
Resize(dimensions_count);
|
||||
int32* dst_dims = DimsData();
|
||||
std::memcpy(dst_dims, dims_data, dimensions_count * sizeof(int32));
|
||||
int32_t* dst_dims = DimsData();
|
||||
std::memcpy(dst_dims, dims_data, dimensions_count * sizeof(int32_t));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@ -239,7 +245,7 @@ class RuntimeShape {
|
||||
const int dimensions_count =
|
||||
std::distance(src_iterable.begin(), src_iterable.end());
|
||||
Resize(dimensions_count);
|
||||
int32* data = DimsData();
|
||||
int32_t* data = DimsData();
|
||||
for (auto it : src_iterable) {
|
||||
*data = it;
|
||||
++data;
|
||||
@ -288,13 +294,13 @@ class RuntimeShape {
|
||||
SetDim(i, pad_value);
|
||||
}
|
||||
std::memcpy(DimsData() + size_increase, shape.DimsData(),
|
||||
sizeof(int32) * shape.DimensionsCount());
|
||||
sizeof(int32_t) * shape.DimensionsCount());
|
||||
}
|
||||
|
||||
int32 size_;
|
||||
int32_t size_;
|
||||
union {
|
||||
int32 dims_[kMaxSmallSize];
|
||||
int32* dims_pointer_;
|
||||
int32_t dims_[kMaxSmallSize];
|
||||
int32_t* dims_pointer_;
|
||||
};
|
||||
};
|
||||
|
||||
@ -713,7 +719,7 @@ void ComputeStrides(Dims<N>* dims) {
|
||||
}
|
||||
}
|
||||
|
||||
enum class BroadcastableOpCategory : uint8 {
|
||||
enum class BroadcastableOpCategory : uint8_t {
|
||||
kNone,
|
||||
kNonBroadcast, // Matching input shapes.
|
||||
kFirstInputBroadcastsFast, // Fivefold nested loops.
|
||||
@ -729,21 +735,21 @@ static_assert(sizeof(MinMax) == 8, "");
|
||||
|
||||
struct ActivationParams {
|
||||
FusedActivationFunctionType activation_type;
|
||||
// uint8, etc, activation params.
|
||||
int32 quantized_activation_min;
|
||||
int32 quantized_activation_max;
|
||||
// uint8_t, etc, activation params.
|
||||
int32_t quantized_activation_min;
|
||||
int32_t quantized_activation_max;
|
||||
};
|
||||
|
||||
struct ReluParams : public ActivationParams {
|
||||
int32 input_offset;
|
||||
int32 output_offset;
|
||||
int32 output_multiplier;
|
||||
int32 output_shift;
|
||||
int32_t input_offset;
|
||||
int32_t output_offset;
|
||||
int32_t output_multiplier;
|
||||
int output_shift;
|
||||
};
|
||||
|
||||
// Styles of resizing op usages. For example, kImageStyle can be used with a Pad
|
||||
// op for pattern-specific optimization.
|
||||
enum class ResizingCategory : uint8 {
|
||||
enum class ResizingCategory : uint8_t {
|
||||
kNone,
|
||||
kImageStyle, // 4D, operating on inner dimensions, say {0, a, b, 0}.
|
||||
kGenericResize,
|
||||
@ -753,24 +759,29 @@ enum class ResizingCategory : uint8 {
|
||||
struct ArithmeticParams {
|
||||
// Shape dependent / common to data / op types.
|
||||
BroadcastableOpCategory broadcast_category;
|
||||
// uint8 inference params.
|
||||
int32 input1_offset;
|
||||
int32 input2_offset;
|
||||
int32 output_offset;
|
||||
int32 output_multiplier;
|
||||
// uint8_t inference params.
|
||||
int32_t input1_offset;
|
||||
int32_t input2_offset;
|
||||
int32_t output_offset;
|
||||
int32_t output_multiplier;
|
||||
int output_shift;
|
||||
// Add / Sub, not Mul, uint8 inference params.
|
||||
// Add / Sub, not Mul, uint8_t inference params.
|
||||
int left_shift;
|
||||
int32 input1_multiplier;
|
||||
int32_t input1_multiplier;
|
||||
int input1_shift;
|
||||
int32 input2_multiplier;
|
||||
int32_t input2_multiplier;
|
||||
int input2_shift;
|
||||
// uint8, etc, activation params.
|
||||
int32 quantized_activation_min;
|
||||
int32 quantized_activation_max;
|
||||
|
||||
// TODO(b/158622529): Union the following activation params.
|
||||
// uint8_t, etc, activation params.
|
||||
int32_t quantized_activation_min;
|
||||
int32_t quantized_activation_max;
|
||||
// float activation params.
|
||||
float float_activation_min;
|
||||
float float_activation_max;
|
||||
// int64_t activation params.
|
||||
int64_t int64_activation_min;
|
||||
int64_t int64_activation_max;
|
||||
|
||||
// Processed output dimensions.
|
||||
// Let input "a" be the one that broadcasts in the faster-changing dimension.
|
||||
@ -785,22 +796,22 @@ struct ArithmeticParams {
|
||||
};
|
||||
|
||||
struct ConcatenationParams {
|
||||
int8 axis;
|
||||
const int32* input_zeropoint;
|
||||
int8_t axis;
|
||||
const int32_t* input_zeropoint;
|
||||
const float* input_scale;
|
||||
uint16 inputs_count;
|
||||
int32 output_zeropoint;
|
||||
uint16_t inputs_count;
|
||||
int32_t output_zeropoint;
|
||||
float output_scale;
|
||||
};
|
||||
|
||||
struct ComparisonParams {
|
||||
// uint8 inference params.
|
||||
// uint8_t inference params.
|
||||
int left_shift;
|
||||
int32 input1_offset;
|
||||
int32 input1_multiplier;
|
||||
int32_t input1_offset;
|
||||
int32_t input1_multiplier;
|
||||
int input1_shift;
|
||||
int32 input2_offset;
|
||||
int32 input2_multiplier;
|
||||
int32_t input2_offset;
|
||||
int32_t input2_multiplier;
|
||||
int input2_shift;
|
||||
// Shape dependent / common to inference types.
|
||||
bool is_broadcast;
|
||||
@ -810,81 +821,81 @@ struct ConvParams {
|
||||
PaddingType padding_type;
|
||||
PaddingValues padding_values;
|
||||
// TODO(starka): This was just "stride", so check that width+height is OK.
|
||||
int16 stride_width;
|
||||
int16 stride_height;
|
||||
int16 dilation_width_factor;
|
||||
int16 dilation_height_factor;
|
||||
// uint8 inference params.
|
||||
int16_t stride_width;
|
||||
int16_t stride_height;
|
||||
int16_t dilation_width_factor;
|
||||
int16_t dilation_height_factor;
|
||||
// uint8_t inference params.
|
||||
// TODO(b/65838351): Use smaller types if appropriate.
|
||||
int32 input_offset;
|
||||
int32 weights_offset;
|
||||
int32 output_offset;
|
||||
int32 output_multiplier;
|
||||
int32_t input_offset;
|
||||
int32_t weights_offset;
|
||||
int32_t output_offset;
|
||||
int32_t output_multiplier;
|
||||
int output_shift;
|
||||
// uint8, etc, activation params.
|
||||
int32 quantized_activation_min;
|
||||
int32 quantized_activation_max;
|
||||
// uint8_t, etc, activation params.
|
||||
int32_t quantized_activation_min;
|
||||
int32_t quantized_activation_max;
|
||||
// float activation params.
|
||||
float float_activation_min;
|
||||
float float_activation_max;
|
||||
};
|
||||
|
||||
struct DepthToSpaceParams {
|
||||
int32 block_size;
|
||||
int32_t block_size;
|
||||
};
|
||||
|
||||
struct DepthwiseParams {
|
||||
PaddingType padding_type;
|
||||
PaddingValues padding_values;
|
||||
int16 stride_width;
|
||||
int16 stride_height;
|
||||
int16 dilation_width_factor;
|
||||
int16 dilation_height_factor;
|
||||
int16 depth_multiplier;
|
||||
// uint8 inference params.
|
||||
int16_t stride_width;
|
||||
int16_t stride_height;
|
||||
int16_t dilation_width_factor;
|
||||
int16_t dilation_height_factor;
|
||||
int16_t depth_multiplier;
|
||||
// uint8_t inference params.
|
||||
// TODO(b/65838351): Use smaller types if appropriate.
|
||||
int32 input_offset;
|
||||
int32 weights_offset;
|
||||
int32 output_offset;
|
||||
int32 output_multiplier;
|
||||
int32_t input_offset;
|
||||
int32_t weights_offset;
|
||||
int32_t output_offset;
|
||||
int32_t output_multiplier;
|
||||
int output_shift;
|
||||
// uint8, etc, activation params.
|
||||
int32 quantized_activation_min;
|
||||
int32 quantized_activation_max;
|
||||
// uint8_t, etc, activation params.
|
||||
int32_t quantized_activation_min;
|
||||
int32_t quantized_activation_max;
|
||||
// float activation params.
|
||||
float float_activation_min;
|
||||
float float_activation_max;
|
||||
const int32* output_multiplier_per_channel;
|
||||
const int32* output_shift_per_channel;
|
||||
const int32_t* output_multiplier_per_channel;
|
||||
const int32_t* output_shift_per_channel;
|
||||
};
|
||||
|
||||
struct DequantizationParams {
|
||||
double scale;
|
||||
int32 zero_point;
|
||||
int32_t zero_point;
|
||||
};
|
||||
|
||||
struct PerChannelDequantizationParams {
|
||||
const float* scale;
|
||||
const int32* zero_point;
|
||||
int32 quantized_dimension;
|
||||
const int32_t* zero_point;
|
||||
int32_t quantized_dimension;
|
||||
};
|
||||
|
||||
struct FakeQuantParams {
|
||||
MinMax minmax;
|
||||
int32 num_bits;
|
||||
int32_t num_bits;
|
||||
};
|
||||
|
||||
struct FullyConnectedParams {
|
||||
// uint8 inference params.
|
||||
// uint8_t inference params.
|
||||
// TODO(b/65838351): Use smaller types if appropriate.
|
||||
int32 input_offset;
|
||||
int32 weights_offset;
|
||||
int32 output_offset;
|
||||
int32 output_multiplier;
|
||||
int32_t input_offset;
|
||||
int32_t weights_offset;
|
||||
int32_t output_offset;
|
||||
int32_t output_multiplier;
|
||||
int output_shift;
|
||||
// uint8, etc, activation params.
|
||||
int32 quantized_activation_min;
|
||||
int32 quantized_activation_max;
|
||||
// uint8_t, etc, activation params.
|
||||
int32_t quantized_activation_min;
|
||||
int32_t quantized_activation_max;
|
||||
// float activation params.
|
||||
float float_activation_min;
|
||||
float float_activation_max;
|
||||
@ -895,16 +906,16 @@ struct FullyConnectedParams {
|
||||
};
|
||||
|
||||
struct GatherParams {
|
||||
int16 axis;
|
||||
int16_t axis;
|
||||
};
|
||||
|
||||
struct L2NormalizationParams {
|
||||
// uint8 inference params.
|
||||
int32 input_zero_point;
|
||||
// uint8_t inference params.
|
||||
int32_t input_zero_point;
|
||||
};
|
||||
|
||||
struct LocalResponseNormalizationParams {
|
||||
int32 range;
|
||||
int32_t range;
|
||||
double bias;
|
||||
double alpha;
|
||||
double beta;
|
||||
@ -932,50 +943,50 @@ struct HardSwishParams {
|
||||
};
|
||||
|
||||
struct LogisticParams {
|
||||
// uint8 inference params.
|
||||
int32 input_zero_point;
|
||||
int32 input_range_radius;
|
||||
int32 input_multiplier;
|
||||
// uint8_t inference params.
|
||||
int32_t input_zero_point;
|
||||
int32_t input_range_radius;
|
||||
int32_t input_multiplier;
|
||||
int input_left_shift;
|
||||
};
|
||||
|
||||
struct LstmCellParams {
|
||||
int32 weights_zero_point;
|
||||
int32 accum_multiplier;
|
||||
int32_t weights_zero_point;
|
||||
int32_t accum_multiplier;
|
||||
int accum_shift;
|
||||
int state_integer_bits;
|
||||
};
|
||||
|
||||
struct MeanParams {
|
||||
int8 axis_count;
|
||||
int16 axis[4];
|
||||
int8_t axis_count;
|
||||
int16_t axis[4];
|
||||
};
|
||||
|
||||
struct PackParams {
|
||||
int8 axis;
|
||||
const int32* input_zeropoint;
|
||||
int8_t axis;
|
||||
const int32_t* input_zeropoint;
|
||||
const float* input_scale;
|
||||
uint16 inputs_count;
|
||||
int32 output_zeropoint;
|
||||
uint16_t inputs_count;
|
||||
int32_t output_zeropoint;
|
||||
float output_scale;
|
||||
};
|
||||
|
||||
struct PadParams {
|
||||
int8 left_padding_count;
|
||||
int32 left_padding[4];
|
||||
int8 right_padding_count;
|
||||
int32 right_padding[4];
|
||||
int8_t left_padding_count;
|
||||
int32_t left_padding[4];
|
||||
int8_t right_padding_count;
|
||||
int32_t right_padding[4];
|
||||
ResizingCategory resizing_category;
|
||||
};
|
||||
|
||||
struct PreluParams {
|
||||
int32 input_offset;
|
||||
int32 alpha_offset;
|
||||
int32 output_offset;
|
||||
int32 output_multiplier_1;
|
||||
int32 output_shift_1;
|
||||
int32 output_multiplier_2;
|
||||
int32 output_shift_2;
|
||||
int32_t input_offset;
|
||||
int32_t alpha_offset;
|
||||
int32_t output_offset;
|
||||
int32_t output_multiplier_1;
|
||||
int output_shift_1;
|
||||
int32_t output_multiplier_2;
|
||||
int output_shift_2;
|
||||
};
|
||||
|
||||
struct PoolParams {
|
||||
@ -986,17 +997,17 @@ struct PoolParams {
|
||||
int stride_width;
|
||||
int filter_height;
|
||||
int filter_width;
|
||||
// uint8, etc, activation params.
|
||||
int32 quantized_activation_min;
|
||||
int32 quantized_activation_max;
|
||||
// uint8_t, etc, activation params.
|
||||
int32_t quantized_activation_min;
|
||||
int32_t quantized_activation_max;
|
||||
// float activation params.
|
||||
float float_activation_min;
|
||||
float float_activation_max;
|
||||
};
|
||||
|
||||
struct ReshapeParams {
|
||||
int8 shape_count;
|
||||
int32 shape[4];
|
||||
int8_t shape_count;
|
||||
int32_t shape[4];
|
||||
};
|
||||
|
||||
struct ResizeBilinearParams {
|
||||
@ -1013,91 +1024,93 @@ struct ResizeNearestNeighborParams {
|
||||
};
|
||||
|
||||
struct SliceParams {
|
||||
int8 begin_count;
|
||||
int32 begin[4];
|
||||
int8 size_count;
|
||||
int32 size[4];
|
||||
int8_t begin_count;
|
||||
int32_t begin[4];
|
||||
int8_t size_count;
|
||||
int32_t size[4];
|
||||
};
|
||||
|
||||
struct SoftmaxParams {
|
||||
// beta is not really used (not a Tensorflow parameter) and not implemented
|
||||
// for LogSoftmax.
|
||||
double beta;
|
||||
// uint8 inference params. Used even when beta defaults to 1.0.
|
||||
int32 input_multiplier;
|
||||
int32 input_left_shift;
|
||||
// uint8_t inference params. Used even when beta defaults to 1.0.
|
||||
int32_t input_multiplier;
|
||||
int32_t input_left_shift;
|
||||
// Reverse scaling is only used by LogSoftmax.
|
||||
int32 reverse_scaling_divisor;
|
||||
int32 reverse_scaling_right_shift;
|
||||
int32_t reverse_scaling_divisor;
|
||||
int32_t reverse_scaling_right_shift;
|
||||
int diff_min;
|
||||
int32_t zero_point;
|
||||
float scale;
|
||||
float* table;
|
||||
int16_t* exp_lut;
|
||||
int16_t* one_over_one_plus_x_lut;
|
||||
uint8_t* uint8_table1;
|
||||
uint8_t* uint8_table2;
|
||||
};
|
||||
|
||||
struct SpaceToBatchParams {
|
||||
// "Zero" padding for uint8 means padding with the output offset.
|
||||
int32 output_offset;
|
||||
// "Zero" padding for uint8_t means padding with the output offset.
|
||||
int32_t output_offset;
|
||||
};
|
||||
|
||||
struct SpaceToDepthParams {
|
||||
int32 block_size;
|
||||
int32_t block_size;
|
||||
};
|
||||
|
||||
struct SplitParams {
|
||||
// Graphs that split into, say, 2000 nodes are encountered. The indices in
|
||||
// OperatorEdges are of type uint16.
|
||||
uint16 num_split;
|
||||
int16 axis;
|
||||
// OperatorEdges are of type uint16_t.
|
||||
uint16_t num_split;
|
||||
int16_t axis;
|
||||
};
|
||||
|
||||
struct SqueezeParams {
|
||||
int8 squeeze_dims_count;
|
||||
int32 squeeze_dims[4];
|
||||
int8_t squeeze_dims_count;
|
||||
int32_t squeeze_dims[4];
|
||||
};
|
||||
|
||||
struct StridedSliceParams {
|
||||
int8 start_indices_count;
|
||||
int32 start_indices[5];
|
||||
int8 stop_indices_count;
|
||||
int32 stop_indices[5];
|
||||
int8 strides_count;
|
||||
int32 strides[5];
|
||||
int8_t start_indices_count;
|
||||
int32_t start_indices[5];
|
||||
int8_t stop_indices_count;
|
||||
int32_t stop_indices[5];
|
||||
int8_t strides_count;
|
||||
int32_t strides[5];
|
||||
|
||||
int16 begin_mask;
|
||||
int16 ellipsis_mask;
|
||||
int16 end_mask;
|
||||
int16 new_axis_mask;
|
||||
int16 shrink_axis_mask;
|
||||
int16_t begin_mask;
|
||||
int16_t ellipsis_mask;
|
||||
int16_t end_mask;
|
||||
int16_t new_axis_mask;
|
||||
int16_t shrink_axis_mask;
|
||||
};
|
||||
|
||||
struct TanhParams {
|
||||
int32 input_zero_point;
|
||||
int32 input_range_radius;
|
||||
int32 input_multiplier;
|
||||
int32_t input_zero_point;
|
||||
int32_t input_range_radius;
|
||||
int32_t input_multiplier;
|
||||
int input_left_shift;
|
||||
};
|
||||
|
||||
struct TransposeParams {
|
||||
int8 perm_count;
|
||||
int32 perm[5];
|
||||
int8_t perm_count;
|
||||
int32_t perm[5];
|
||||
};
|
||||
|
||||
struct UnpackParams {
|
||||
uint16 num_split;
|
||||
int16 axis;
|
||||
uint16_t num_split;
|
||||
int16_t axis;
|
||||
};
|
||||
|
||||
struct LeakyReluParams {
|
||||
float alpha;
|
||||
int32 input_offset;
|
||||
int32 output_offset;
|
||||
int32 output_multiplier_alpha;
|
||||
int32 output_shift_alpha;
|
||||
int32 output_multiplier_identity;
|
||||
int32 output_shift_identity;
|
||||
int32_t input_offset;
|
||||
int32_t output_offset;
|
||||
int32_t output_multiplier_alpha;
|
||||
int32_t output_shift_alpha;
|
||||
int32_t output_multiplier_identity;
|
||||
int32_t output_shift_identity;
|
||||
};
|
||||
|
||||
template <typename P>
|
||||
@ -1107,13 +1120,19 @@ inline void SetActivationParams(float min, float max, P* params) {
|
||||
}
|
||||
|
||||
template <typename P>
|
||||
inline void SetActivationParams(int32 min, int32 max, P* params) {
|
||||
inline void SetActivationParams(int32_t min, int32_t max, P* params) {
|
||||
params->quantized_activation_min = min;
|
||||
params->quantized_activation_max = max;
|
||||
}
|
||||
|
||||
template <typename P>
|
||||
inline void GetActivationParams(const P& params, int32* min, int32* max) {
|
||||
inline void SetActivationParams(int64_t min, int64_t max, P* params) {
|
||||
params->int64_activation_min = min;
|
||||
params->int64_activation_max = max;
|
||||
}
|
||||
|
||||
template <typename P>
|
||||
inline void GetActivationParams(const P& params, int32_t* min, int32_t* max) {
|
||||
*min = params.quantized_activation_min;
|
||||
*max = params.quantized_activation_max;
|
||||
}
|
||||
@ -1124,6 +1143,11 @@ inline void GetActivationParams(const P& params, float* min, float* max) {
|
||||
*max = params.float_activation_max;
|
||||
}
|
||||
|
||||
template <typename P>
|
||||
inline void GetActivationParams(const P& params, int64_t* min, int64_t* max) {
|
||||
*min = params.int64_activation_min;
|
||||
*max = params.int64_activation_max;
|
||||
}
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_TYPES_H_
|
||||
|
||||
@ -14,15 +14,63 @@ limitations under the License.
|
||||
==============================================================================*/
|
||||
#include "tensorflow/lite/kernels/kernel_util.h"
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
|
||||
#include "tensorflow/lite/c/builtin_op_data.h"
|
||||
#include "tensorflow/lite/c/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/cppmath.h"
|
||||
#include "tensorflow/lite/kernels/internal/quantization_util.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
namespace {
|
||||
|
||||
inline TfLiteTensor* GetMutableInput(const TfLiteContext* context,
|
||||
const TfLiteNode* node, int index) {
|
||||
if (context->tensors != nullptr) {
|
||||
return &context->tensors[node->inputs->data[index]];
|
||||
} else {
|
||||
return context->GetTensor(context, node->inputs->data[index]);
|
||||
}
|
||||
}
|
||||
|
||||
} // anonymous namespace.
|
||||
|
||||
const TfLiteTensor* GetInput(const TfLiteContext* context,
|
||||
const TfLiteNode* node, int index) {
|
||||
return GetMutableInput(context, node, index);
|
||||
}
|
||||
|
||||
TfLiteTensor* GetVariableInput(TfLiteContext* context, const TfLiteNode* node,
|
||||
int index) {
|
||||
TfLiteTensor* tensor = GetMutableInput(context, node, index);
|
||||
return tensor->is_variable ? tensor : nullptr;
|
||||
}
|
||||
|
||||
TfLiteTensor* GetOutput(TfLiteContext* context, const TfLiteNode* node,
|
||||
int index) {
|
||||
if (context->tensors != nullptr) {
|
||||
return &context->tensors[node->outputs->data[index]];
|
||||
} else {
|
||||
return context->GetTensor(context, node->outputs->data[index]);
|
||||
}
|
||||
}
|
||||
|
||||
const TfLiteTensor* GetOptionalInputTensor(const TfLiteContext* context,
|
||||
const TfLiteNode* node, int index) {
|
||||
const bool use_tensor = index < node->inputs->size &&
|
||||
node->inputs->data[index] != kTfLiteOptionalTensor;
|
||||
if (use_tensor) {
|
||||
return GetMutableInput(context, node, index);
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Per-axis
|
||||
TfLiteStatus PopulateConvolutionQuantizationParams(
|
||||
TfLiteContext* context, const TfLiteTensor* input,
|
||||
@ -126,11 +174,27 @@ TfLiteStatus GetQuantizedConvolutionMultipler(TfLiteContext* context,
|
||||
// pipeline.
|
||||
if (bias) {
|
||||
const double bias_scale = static_cast<double>(bias->params.scale);
|
||||
// Here we're making sure the input_product_scale & bias_scale the same.
|
||||
// Normally this should be guaranteed by the training pipeline, we are
|
||||
// setting the threshold to be 2e-6 to allow some numeric stability
|
||||
// difference.
|
||||
TF_LITE_ENSURE(context, std::abs(input_product_scale - bias_scale) <= 2e-6);
|
||||
// Here we're making sure the input_product_scale & bias_scale are about the
|
||||
// same. Since we have:
|
||||
// (output - output_zp) * output_scale =
|
||||
// input_product_scale * input_product + bias * bias_scale ---- (0)
|
||||
//
|
||||
// (0) equals:
|
||||
// (input_product + bias) * input_product_scale ----- (1)
|
||||
// +
|
||||
// bias * (bias_scale - input_product_scale) ------ (2)
|
||||
//
|
||||
// For the real kernel computation, we're doing (1), so we really need to
|
||||
// make sure (2) has minimum impact on the output, so:
|
||||
// bias * (bias_scale - input_product_scale) / output_scale should be
|
||||
// a small number for an integer.
|
||||
// Since normally bias should be within a small range.
|
||||
// We should expect (bias_scale - input_product_scale) / output_scale to
|
||||
// be a small number like 0.02.
|
||||
const double scale_diff = std::abs(input_product_scale - bias_scale);
|
||||
const double output_scale = static_cast<double>(output->params.scale);
|
||||
|
||||
TF_LITE_ENSURE(context, scale_diff / output_scale <= 0.02);
|
||||
}
|
||||
return GetQuantizedConvolutionMultipler(context, input, filter, output,
|
||||
multiplier);
|
||||
@ -167,7 +231,7 @@ void CalculateActivationRangeQuantizedImpl(TfLiteFusedActivation activation,
|
||||
} else if (activation == kTfLiteActRelu6) {
|
||||
*act_min = std::max(qmin, quantize(0.0));
|
||||
*act_max = std::min(qmax, quantize(6.0));
|
||||
} else if (activation == kTfLiteActRelu1) {
|
||||
} else if (activation == kTfLiteActReluN1To1) {
|
||||
*act_min = std::max(qmin, quantize(-1.0));
|
||||
*act_max = std::min(qmax, quantize(1.0));
|
||||
} else {
|
||||
|
||||
@ -15,52 +15,55 @@ limitations under the License.
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_KERNEL_UTIL_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_KERNEL_UTIL_H_
|
||||
|
||||
#include <algorithm>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <limits>
|
||||
|
||||
#include "flatbuffers/flatbuffers.h"
|
||||
#include "tensorflow/lite/c/builtin_op_data.h"
|
||||
#include "tensorflow/lite/c/common.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
// A fair number of functions in this header have historically been inline.
|
||||
// It is ok to change functions to not be inline if the latency with
|
||||
// benchmark_model for MobileNet + MobileBERT is unaffected. If such a change is
|
||||
// made, move the newly non-inlined function declarations to the top of this
|
||||
// header file.
|
||||
const TfLiteTensor* GetInput(const TfLiteContext* context,
|
||||
const TfLiteNode* node, int index);
|
||||
|
||||
// Note: You must check if result is not null:
|
||||
// TfLiteTensor* my_tensor = GetVariableInput(context, node, kMyTensorIdx);
|
||||
// TF_LITE_ENSURE(context, my_tensor != nullptr);
|
||||
TfLiteTensor* GetVariableInput(TfLiteContext* context, const TfLiteNode* node,
|
||||
int index);
|
||||
|
||||
TfLiteTensor* GetOutput(TfLiteContext* context, const TfLiteNode* node,
|
||||
int index);
|
||||
|
||||
const TfLiteTensor* GetOptionalInputTensor(const TfLiteContext* context,
|
||||
const TfLiteNode* node, int index);
|
||||
|
||||
inline int NumDimensions(const TfLiteTensor* t) { return t->dims->size; }
|
||||
inline int SizeOfDimension(const TfLiteTensor* t, int dim) {
|
||||
return t->dims->data[dim];
|
||||
}
|
||||
inline const TfLiteTensor* GetInput(TfLiteContext* context,
|
||||
const TfLiteNode* node, int index) {
|
||||
return &context
|
||||
->tensors[flatbuffers::EndianScalar(node->inputs->data[index])];
|
||||
}
|
||||
// Note: You must check if result is not null:
|
||||
// TfLiteTensor* my_tensor = GetVariableInput(context, node, kMyTensorIdx);
|
||||
// TF_LITE_ENSURE(context, my_tensor != nullptr);
|
||||
inline TfLiteTensor* GetVariableInput(TfLiteContext* context,
|
||||
const TfLiteNode* node, int index) {
|
||||
TfLiteTensor* tensor =
|
||||
&context->tensors[flatbuffers::EndianScalar(node->inputs->data[index])];
|
||||
return (tensor->is_variable) ? tensor : nullptr;
|
||||
}
|
||||
inline TfLiteTensor* GetOutput(TfLiteContext* context, const TfLiteNode* node,
|
||||
int index) {
|
||||
return &context
|
||||
->tensors[flatbuffers::EndianScalar(node->outputs->data[index])];
|
||||
}
|
||||
|
||||
#ifndef TF_LITE_STATIC_MEMORY
|
||||
inline TfLiteTensor* GetTemporary(TfLiteContext* context,
|
||||
const TfLiteNode* node, int index) {
|
||||
return &context->tensors[flatbuffers::EndianScalar(
|
||||
node->temporaries->data[index])];
|
||||
return &context->tensors[node->temporaries->data[index]];
|
||||
}
|
||||
inline const TfLiteTensor* GetIntermediates(TfLiteContext* context,
|
||||
const TfLiteNode* node, int index) {
|
||||
return &context->tensors[node->intermediates->data[index]];
|
||||
}
|
||||
inline int NumInputs(const TfLiteNode* node) { return node->inputs->size; }
|
||||
inline int NumOutputs(const TfLiteNode* node) { return node->outputs->size; }
|
||||
inline int NumIntermediates(const TfLiteNode* node) {
|
||||
return node->intermediates->size;
|
||||
}
|
||||
#endif // TF_LITE_STATIC_MEMORY
|
||||
inline int NumInputs(const TfLiteNode* node) { return node->inputs->size; }
|
||||
inline int NumOutputs(const TfLiteNode* node) { return node->outputs->size; }
|
||||
|
||||
inline int64_t NumElements(const TfLiteIntArray* dims) {
|
||||
int64_t count = 1;
|
||||
@ -74,19 +77,11 @@ inline int64_t NumElements(const TfLiteTensor* t) {
|
||||
return NumElements(t->dims);
|
||||
}
|
||||
|
||||
inline const TfLiteTensor* GetOptionalInputTensor(TfLiteContext* context,
|
||||
const TfLiteNode* node,
|
||||
int index) {
|
||||
const bool use_tensor = index < node->inputs->size &&
|
||||
node->inputs->data[index] != kTfLiteOptionalTensor;
|
||||
if (use_tensor) {
|
||||
return &context
|
||||
->tensors[flatbuffers::EndianScalar(node->inputs->data[index])];
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Determines whether tensor is constant.
|
||||
// TODO(b/138199592): Introduce new query which checks for constant OR
|
||||
// persistent-read-only, which would be useful for most tensor kernels that
|
||||
// are potentially dynamic based on the input tensor value availability at the
|
||||
// time of prepare.
|
||||
inline bool IsConstantTensor(const TfLiteTensor* tensor) {
|
||||
return tensor->allocation_type == kTfLiteMmapRo;
|
||||
}
|
||||
@ -105,6 +100,14 @@ inline void SetTensorToDynamic(TfLiteTensor* tensor) {
|
||||
}
|
||||
}
|
||||
|
||||
// Sets tensor to persistent and read-only.
|
||||
inline void SetTensorToPersistentRo(TfLiteTensor* tensor) {
|
||||
if (tensor->allocation_type != kTfLitePersistentRo) {
|
||||
tensor->allocation_type = kTfLitePersistentRo;
|
||||
tensor->data.raw = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
// Determines whether it is a hybrid op - one that has float inputs and
|
||||
// quantized weights.
|
||||
inline bool IsHybridOp(const TfLiteTensor* input, const TfLiteTensor* weight) {
|
||||
@ -162,7 +165,7 @@ void CalculateActivationRange(TfLiteFusedActivation activation,
|
||||
} else if (activation == kTfLiteActRelu6) {
|
||||
*activation_min = 0;
|
||||
*activation_max = 6;
|
||||
} else if (activation == kTfLiteActRelu1) {
|
||||
} else if (activation == kTfLiteActReluN1To1) {
|
||||
*activation_min = -1;
|
||||
*activation_max = 1;
|
||||
} else {
|
||||
|
||||
@ -19,7 +19,7 @@ limitations under the License.
|
||||
// non-portable function.
|
||||
#ifdef TF_LITE_MCU_DEBUG_LOG
|
||||
|
||||
#include "tensorflow/lite/micro/micro_error_reporter.h"
|
||||
#include "tensorflow/lite/micro/debug_log.h"
|
||||
|
||||
#define DEBUG_LOG(x) \
|
||||
do { \
|
||||
@ -36,7 +36,6 @@ inline void InfiniteLoop() {
|
||||
|
||||
#else // TF_LITE_MCU_DEBUG_LOG
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
|
||||
@ -45,6 +44,15 @@ inline void InfiniteLoop() {
|
||||
fprintf(stderr, "%s", (x)); \
|
||||
} while (0)
|
||||
|
||||
// Report Error for unsupported type by op 'op_name' and returns kTfLiteError.
|
||||
#define TF_LITE_UNSUPPORTED_TYPE(context, type, op_name) \
|
||||
do { \
|
||||
TF_LITE_KERNEL_LOG((context), "%s:%d Type %s is unsupported by op %s.", \
|
||||
__FILE__, __LINE__, TfLiteTypeGetName(type), \
|
||||
(op_name)); \
|
||||
return kTfLiteError; \
|
||||
} while (0)
|
||||
|
||||
#define TFLITE_ABORT abort()
#endif // TF_LITE_MCU_DEBUG_LOG

@ -15,10 +15,10 @@ limitations under the License.

#include "tensorflow/lite/micro/examples/hello_world/main_functions.h"

#include "tensorflow/lite/micro/all_ops_resolver.h"
#include "tensorflow/lite/micro/examples/hello_world/constants.h"
#include "tensorflow/lite/micro/examples/hello_world/model.h"
#include "tensorflow/lite/micro/examples/hello_world/output_handler.h"
#include "tensorflow/lite/micro/kernels/all_ops_resolver.h"
#include "tensorflow/lite/micro/micro_error_reporter.h"
#include "tensorflow/lite/micro/micro_interpreter.h"
#include "tensorflow/lite/schema/schema_generated.h"
@ -34,8 +34,12 @@ TfLiteTensor* output = nullptr;
int inference_count = 0;

// Create an area of memory to use for input, output, and intermediate arrays.
// Finding the minimum value for your model may require some trial and error.
constexpr int kTensorArenaSize = 2 * 1024;
// Minimum arena size, at the time of writing. After allocating tensors
// you can retrieve this value by invoking interpreter.arena_used_bytes().
const int kModelArenaSize = 2468;
// Extra headroom for model + alignment + future interpreter changes.
const int kExtraArenaSize = 560 + 16 + 100;
const int kTensorArenaSize = kModelArenaSize + kExtraArenaSize;
uint8_t tensor_arena[kTensorArenaSize];
} // namespace

@ -60,7 +64,7 @@ void setup() {

// This pulls in all the operation implementations we need.
// NOLINTNEXTLINE(runtime-global-variables)
static tflite::ops::micro::AllOpsResolver resolver;
static tflite::AllOpsResolver resolver;

// Build an interpreter to run the model with.
static tflite::MicroInterpreter static_interpreter(

@ -24,19 +24,8 @@ limitations under the License.

#include "tensorflow/lite/micro/examples/hello_world/model.h"

// We need to keep the data array aligned on some architectures.
#ifdef __has_attribute
#define HAVE_ATTRIBUTE(x) __has_attribute(x)
#else
#define HAVE_ATTRIBUTE(x) 0
#endif
#if HAVE_ATTRIBUTE(aligned) || (defined(__GNUC__) && !defined(__clang__))
#define DATA_ALIGN_ATTRIBUTE __attribute__((aligned(4)))
#else
#define DATA_ALIGN_ATTRIBUTE
#endif

const unsigned char g_model[] DATA_ALIGN_ATTRIBUTE = {
// Keep model aligned to 8 bytes to guarantee aligned 64-bit accesses.
alignas(8) const unsigned char g_model[] = {
0x1c, 0x00, 0x00, 0x00, 0x54, 0x46, 0x4c, 0x33, 0x00, 0x00, 0x12, 0x00,
0x1c, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x14, 0x00,
0x00, 0x00, 0x18, 0x00, 0x12, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,

@ -21,6 +21,8 @@ limitations under the License.

#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/max.h"
#include "tensorflow/lite/kernels/internal/min.h"

namespace tflite {
namespace ops {
@ -32,11 +34,11 @@ inline float ActivationValFloat(TfLiteFusedActivation act, float a) {
case kTfLiteActNone:
return a;
case kTfLiteActRelu:
return std::max(0.0f, a);
case kTfLiteActRelu1:
return std::max(-1.0f, std::min(a, 1.0f));
return TfLiteMax(0.0f, a);
case kTfLiteActReluN1To1:
return TfLiteMax(-1.0f, TfLiteMin(a, 1.0f));
case kTfLiteActRelu6:
return std::max(0.0f, std::min(a, 6.0f));
return TfLiteMax(0.0f, TfLiteMin(a, 6.0f));
case kTfLiteActTanh:
return std::tanh(a);
case kTfLiteActSignBit:

@ -18,30 +18,82 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/internal/types.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/op_macros.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"
#include "tensorflow/lite/micro/micro_utils.h"

namespace tflite {
namespace ops {
namespace micro {
namespace activations {
namespace {

struct ReluOpData {
ReluParams params;
};

struct Relu6OpData {
int8_t six_int8;
int8_t zero_int8;
uint8_t six_uint8;
uint8_t zero_uint8;
};

} // namespace

constexpr int kInputTensor = 0;
constexpr int kOutputTensor = 0;

template <typename Q>
inline void ReluQuantized(int32_t lower, const RuntimeShape& input_shape,
const Q* input_data, const RuntimeShape& output_shape,
Q* output_data) {
template <typename T>
inline void ReluQuantized(const ReluOpData& data,
const RuntimeShape& input_shape,
const RuntimeShape& output_shape, const T* input_data,
T* output_data) {
const int flat_size = MatchingFlatSize(input_shape, output_shape);
for (int i = 0; i < flat_size; ++i) {
const Q val = input_data[i];
const Q clamped = val < lower ? lower : val;
output_data[i] = clamped;
const int32_t val = static_cast<int32_t>(input_data[i]);
int32_t clamped =
data.params.output_offset +
MultiplyByQuantizedMultiplier(val - data.params.input_offset,
data.params.output_multiplier,
data.params.output_shift);
clamped = std::max(data.params.quantized_activation_min, clamped);
clamped = std::min(data.params.quantized_activation_max, clamped);
output_data[i] = static_cast<T>(clamped);
}
}

template <typename T>
inline void CalculateReluOpData(const TfLiteTensor* input, TfLiteTensor* output,
ReluOpData* data) {
float act_min = 0.0;
float act_max = std::numeric_limits<float>::infinity();
double real_multiplier =
static_cast<double>(input->params.scale / output->params.scale);

const RuntimeShape input_shape = GetTensorShape(input);
const RuntimeShape output_shape = GetTensorShape(output);

QuantizeMultiplier(real_multiplier, &data->params.output_multiplier,
&data->params.output_shift);

data->params.quantized_activation_min = std::max(
static_cast<int32_t>(std::numeric_limits<T>::min()),
output->params.zero_point +
static_cast<int32_t>(roundf(act_min / output->params.scale)));
data->params.quantized_activation_max =
act_max == std::numeric_limits<float>::infinity()
? static_cast<int32_t>(std::numeric_limits<T>::max())
: std::min(static_cast<int32_t>(std::numeric_limits<T>::max()),
output->params.zero_point +
static_cast<int32_t>(
roundf(act_max / output->params.scale)));
data->params.input_offset = input->params.zero_point;
data->params.output_offset = output->params.zero_point;
}

inline void ReluFloat(const RuntimeShape& input_shape, const float* input_data,
const RuntimeShape& output_shape, float* output_data) {
const int flat_size = MatchingFlatSize(input_shape, output_shape);
@ -77,33 +129,57 @@ inline void Relu6Quantized(Q lower, Q upper, const RuntimeShape& input_shape,
}
}

void* ReluInit(TfLiteContext* context, const char* buffer, size_t length) {
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
return context->AllocatePersistentBuffer(context, sizeof(ReluOpData));
}

TfLiteStatus ReluPrepare(TfLiteContext* context, TfLiteNode* node) {
TFLITE_DCHECK(node->user_data != nullptr);
ReluOpData* data = static_cast<ReluOpData*>(node->user_data);

const TfLiteTensor* input = GetInput(context, node, kInputTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);

if (input->type == kTfLiteInt8) {
CalculateReluOpData<int8_t>(input, output, data);
} else if (input->type == kTfLiteUInt8) {
CalculateReluOpData<uint8_t>(input, output, data);
}

return kTfLiteOk;
}

TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TFLITE_DCHECK(node->user_data != nullptr);
const ReluOpData& data = *(static_cast<const ReluOpData*>(node->user_data));

const TfLiteEvalTensor* input =
tflite::micro::GetEvalInput(context, node, kInputTensor);
TfLiteEvalTensor* output =
tflite::micro::GetEvalOutput(context, node, kOutputTensor);

switch (input->type) {
case kTfLiteFloat32: {
ReluFloat(GetTensorShape(input), GetTensorData<float>(input),
GetTensorShape(output), GetTensorData<float>(output));
ReluFloat(tflite::micro::GetTensorShape(input),
tflite::micro::GetTensorData<float>(input),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<float>(output));

return kTfLiteOk;
}
case kTfLiteInt8: {
ReluQuantized<int8_t>(input->params.zero_point, GetTensorShape(input),
GetTensorData<int8_t>(input),
GetTensorShape(output),
GetTensorData<int8_t>(output));
ReluQuantized<int8_t>(data, tflite::micro::GetTensorShape(input),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<int8_t>(input),
tflite::micro::GetTensorData<int8_t>(output));
return kTfLiteOk;
}
case kTfLiteUInt8: {
ReluQuantized<uint8_t>(input->params.zero_point, GetTensorShape(input),
GetTensorData<uint8_t>(input),
GetTensorShape(output),
GetTensorData<uint8_t>(output));
ReluQuantized<uint8_t>(data, tflite::micro::GetTensorShape(input),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<uint8_t>(input),
tflite::micro::GetTensorData<uint8_t>(output));
return kTfLiteOk;
}
default: {
@ -114,37 +190,62 @@ TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) {
}
}

void* Relu6Init(TfLiteContext* context, const char* buffer, size_t length) {
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
return context->AllocatePersistentBuffer(context, sizeof(Relu6OpData));
}

TfLiteStatus Relu6Prepare(TfLiteContext* context, TfLiteNode* node) {
TFLITE_DCHECK(node->user_data != nullptr);
Relu6OpData* data = static_cast<Relu6OpData*>(node->user_data);

const TfLiteTensor* input = GetInput(context, node, kInputTensor);

if (input->type == kTfLiteInt8) {
data->six_int8 = FloatToAsymmetricQuantizedInt8(6.0f, input->params.scale,
input->params.zero_point);
data->zero_int8 = input->params.zero_point;
} else if (input->type == kTfLiteUInt8) {
data->six_uint8 = FloatToAsymmetricQuantizedUInt8(6.0f, input->params.scale,
input->params.zero_point);
data->zero_uint8 = input->params.zero_point;
}

return kTfLiteOk;
}

TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TFLITE_DCHECK(node->user_data != nullptr);
const Relu6OpData& data = *(static_cast<const Relu6OpData*>(node->user_data));

const TfLiteEvalTensor* input =
tflite::micro::GetEvalInput(context, node, kInputTensor);
TfLiteEvalTensor* output =
tflite::micro::GetEvalOutput(context, node, kOutputTensor);

switch (input->type) {
case kTfLiteFloat32: {
Relu6Float(GetTensorShape(input), GetTensorData<float>(input),
GetTensorShape(output), GetTensorData<float>(output));
Relu6Float(tflite::micro::GetTensorShape(input),
tflite::micro::GetTensorData<float>(input),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<float>(output));

return kTfLiteOk;
}
case kTfLiteInt8: {
const int8_t six = FloatToAsymmetricQuantizedInt8(
6.0f, input->params.scale, input->params.zero_point);
const int8_t zero = input->params.zero_point;
Relu6Quantized<int8_t>(
zero, six, GetTensorShape(input), GetTensorData<int8_t>(input),
GetTensorShape(output), GetTensorData<int8_t>(output));
Relu6Quantized<int8_t>(data.zero_int8, data.six_int8,
tflite::micro::GetTensorShape(input),
tflite::micro::GetTensorData<int8_t>(input),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<int8_t>(output));
return kTfLiteOk;
}
case kTfLiteUInt8: {
const uint8_t six = FloatToAsymmetricQuantizedUInt8(
6.0f, input->params.scale, input->params.zero_point);
const uint8_t zero = input->params.zero_point;
Relu6Quantized<uint8_t>(
zero, six, GetTensorShape(input), GetTensorData<uint8_t>(input),
GetTensorShape(output), GetTensorData<uint8_t>(output));
Relu6Quantized<uint8_t>(data.zero_uint8, data.six_uint8,
tflite::micro::GetTensorShape(input),
tflite::micro::GetTensorData<uint8_t>(input),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<uint8_t>(output));
return kTfLiteOk;
}
default: {
@ -157,28 +258,26 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {

} // namespace activations

TfLiteRegistration* Register_RELU() {
static TfLiteRegistration r = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/activations::ReluPrepare,
/*invoke=*/activations::ReluEval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
TfLiteRegistration Register_RELU() {
return {/*init=*/activations::ReluInit,
/*free=*/nullptr,
/*prepare=*/activations::ReluPrepare,
/*invoke=*/activations::ReluEval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
}

TfLiteRegistration* Register_RELU6() {
static TfLiteRegistration r = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/activations::Relu6Prepare,
/*invoke=*/activations::Relu6Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
TfLiteRegistration Register_RELU6() {
return {/*init=*/activations::Relu6Init,
/*free=*/nullptr,
/*prepare=*/activations::Relu6Prepare,
/*invoke=*/activations::Relu6Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
}

} // namespace micro

@ -23,6 +23,8 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/op_macros.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"
#include "tensorflow/lite/micro/memory_helpers.h"

namespace tflite {
namespace ops {
@ -40,18 +42,22 @@ struct OpData {
// and the special 16-bit -> 16bit quantized path
int input1_shift;
int input2_shift;
int32 output_activation_min;
int32 output_activation_max;
int32_t output_activation_min;
int32_t output_activation_max;

// These fields are used only in the general 8-bit -> 8bit quantized path
int32 input1_multiplier;
int32 input2_multiplier;
int32 output_multiplier;
int32_t input1_multiplier;
int32_t input2_multiplier;
int32_t output_multiplier;
int output_shift;
int left_shift;
int32 input1_offset;
int32 input2_offset;
int32 output_offset;
int32_t input1_offset;
int32_t input2_offset;
int32_t output_offset;

// Used only for float evals:
float output_activation_min_f32;
float output_activation_max_f32;
};

TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteAddParams* params,
@ -89,24 +95,28 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteAddParams* params,
TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
context, params->activation, output, &data->output_activation_min,
&data->output_activation_max));
} else if (output->type == kTfLiteFloat32) {
CalculateActivationRange(params->activation,
&data->output_activation_min_f32,
&data->output_activation_max_f32);
}

return kTfLiteOk;
}

void EvalAdd(TfLiteContext* context, TfLiteNode* node, TfLiteAddParams* params,
const OpData* data, const TfLiteTensor* input1,
const TfLiteTensor* input2, TfLiteTensor* output) {
float output_activation_min, output_activation_max;
CalculateActivationRange(params->activation, &output_activation_min,
&output_activation_max);
const OpData* data, const TfLiteEvalTensor* input1,
const TfLiteEvalTensor* input2, TfLiteEvalTensor* output) {
tflite::ArithmeticParams op_params;
SetActivationParams(output_activation_min, output_activation_max, &op_params);
#define TF_LITE_ADD(opname) \
reference_ops::opname(op_params, GetTensorShape(input1), \
GetTensorData<float>(input1), GetTensorShape(input2), \
GetTensorData<float>(input2), GetTensorShape(output), \
GetTensorData<float>(output))
SetActivationParams(data->output_activation_min_f32,
data->output_activation_max_f32, &op_params);
#define TF_LITE_ADD(opname) \
reference_ops::opname(op_params, tflite::micro::GetTensorShape(input1), \
tflite::micro::GetTensorData<float>(input1), \
tflite::micro::GetTensorShape(input2), \
tflite::micro::GetTensorData<float>(input2), \
tflite::micro::GetTensorShape(output), \
tflite::micro::GetTensorData<float>(output))
if (data->requires_broadcast) {
TF_LITE_ADD(BroadcastAdd4DSlow);
} else {
@ -117,9 +127,9 @@ void EvalAdd(TfLiteContext* context, TfLiteNode* node, TfLiteAddParams* params,

TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
TfLiteAddParams* params, const OpData* data,
const TfLiteTensor* input1,
const TfLiteTensor* input2,
TfLiteTensor* output) {
const TfLiteEvalTensor* input1,
const TfLiteEvalTensor* input2,
TfLiteEvalTensor* output) {
if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
tflite::ArithmeticParams op_params;
op_params.left_shift = data->left_shift;
@ -135,12 +145,15 @@ TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
SetActivationParams(data->output_activation_min,
data->output_activation_max, &op_params);
bool need_broadcast = reference_ops::ProcessBroadcastShapes(
GetTensorShape(input1), GetTensorShape(input2), &op_params);
#define TF_LITE_ADD(type, opname, dtype) \
type::opname(op_params, GetTensorShape(input1), \
GetTensorData<dtype>(input1), GetTensorShape(input2), \
GetTensorData<dtype>(input2), GetTensorShape(output), \
GetTensorData<dtype>(output));
tflite::micro::GetTensorShape(input1),
tflite::micro::GetTensorShape(input2), &op_params);
#define TF_LITE_ADD(type, opname, dtype) \
type::opname(op_params, tflite::micro::GetTensorShape(input1), \
tflite::micro::GetTensorData<dtype>(input1), \
tflite::micro::GetTensorShape(input2), \
tflite::micro::GetTensorData<dtype>(input2), \
tflite::micro::GetTensorShape(output), \
tflite::micro::GetTensorData<dtype>(output));
if (output->type == kTfLiteInt8) {
if (need_broadcast) {
TF_LITE_ADD(reference_integer_ops, BroadcastAdd4DSlow, int8_t);
@ -160,21 +173,45 @@ TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
return kTfLiteOk;
}

TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
auto* params = reinterpret_cast<TfLiteAddParams*>(node->builtin_data);
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
return context->AllocatePersistentBuffer(context, sizeof(OpData));
}

TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TFLITE_DCHECK(node->user_data != nullptr);
TFLITE_DCHECK(node->builtin_data != nullptr);

const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);

OpData data;
OpData* data = static_cast<OpData*>(node->user_data);
auto* params = reinterpret_cast<TfLiteAddParams*>(node->builtin_data);

TF_LITE_ENSURE_STATUS(
CalculateOpData(context, params, input1, input2, output, &data));
CalculateOpData(context, params, input1, input2, output, data));

return kTfLiteOk;
}

TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
auto* params = reinterpret_cast<TfLiteAddParams*>(node->builtin_data);

TFLITE_DCHECK(node->user_data != nullptr);
const OpData* data = static_cast<const OpData*>(node->user_data);

const TfLiteEvalTensor* input1 =
tflite::micro::GetEvalInput(context, node, kInputTensor1);
const TfLiteEvalTensor* input2 =
tflite::micro::GetEvalInput(context, node, kInputTensor2);
TfLiteEvalTensor* output =
tflite::micro::GetEvalOutput(context, node, kOutputTensor);

if (output->type == kTfLiteFloat32) {
EvalAdd(context, node, params, &data, input1, input2, output);
EvalAdd(context, node, params, data, input1, input2, output);
} else if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
TF_LITE_ENSURE_OK(context, EvalAddQuantized(context, node, params, &data,
TF_LITE_ENSURE_OK(context, EvalAddQuantized(context, node, params, data,
input1, input2, output));
} else {
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
@ -187,16 +224,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {

} // namespace add

TfLiteRegistration* Register_ADD() {
static TfLiteRegistration r = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/add::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
TfLiteRegistration Register_ADD() {
return {/*init=*/add::Init,
/*free=*/nullptr,
/*prepare=*/add::Prepare,
/*invoke=*/add::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
}

} // namespace micro

@ -19,6 +19,7 @@ limitations under the License.
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/micro_utils.h"

namespace tflite {
@ -45,14 +46,20 @@ inline void ArgMinMaxHelper(const RuntimeShape& input1_shape,
}

TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node, bool is_arg_max) {
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* axis = GetInput(context, node, kAxis);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
const TfLiteEvalTensor* input =
tflite::micro::GetEvalInput(context, node, kInputTensor);
const TfLiteEvalTensor* axis =
tflite::micro::GetEvalInput(context, node, kAxis);
TfLiteEvalTensor* output =
tflite::micro::GetEvalOutput(context, node, kOutputTensor);

#define TF_LITE_ARG_MIN_MAX(data_type, axis_type, output_type) \
ArgMinMaxHelper(GetTensorShape(input), GetTensorData<data_type>(input), \
GetTensorData<axis_type>(axis), GetTensorShape(output), \
GetTensorData<output_type>(output), is_arg_max)
#define TF_LITE_ARG_MIN_MAX(data_type, axis_type, output_type) \
ArgMinMaxHelper(tflite::micro::GetTensorShape(input), \
tflite::micro::GetTensorData<data_type>(input), \
tflite::micro::GetTensorData<axis_type>(axis), \
tflite::micro::GetTensorShape(output), \
tflite::micro::GetTensorData<output_type>(output), \
is_arg_max)
if (axis->type == kTfLiteInt32) {
if (output->type == kTfLiteInt32) {
switch (input->type) {
@ -67,18 +74,19 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node, bool is_arg_max) {
break;
default:
TF_LITE_KERNEL_LOG(context,
"Only float32, uint8 and int8 are "
"Only float32, uint8_t and int8_t are "
"supported currently, got %s.",
TfLiteTypeGetName(input->type));
return kTfLiteError;
}
} else {
TF_LITE_KERNEL_LOG(context, "Only int32 are supported currently, got %s.",
TF_LITE_KERNEL_LOG(context,
"Only int32_t are supported currently, got %s.",
TfLiteTypeGetName(output->type));
return kTfLiteError;
}
} else {
TF_LITE_KERNEL_LOG(context, "Only int32 are supported currently, got %s.",
TF_LITE_KERNEL_LOG(context, "Only int32_t are supported currently, got %s.",
TfLiteTypeGetName(axis->type));
return kTfLiteError;
}
@ -98,28 +106,26 @@ TfLiteStatus ArgMaxEval(TfLiteContext* context, TfLiteNode* node) {

} // namespace arg_min_max

TfLiteRegistration* Register_ARG_MAX() {
static TfLiteRegistration r = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/arg_min_max::ArgMaxEval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
TfLiteRegistration Register_ARG_MAX() {
return {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/arg_min_max::ArgMaxEval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
}

TfLiteRegistration* Register_ARG_MIN() {
static TfLiteRegistration r = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/arg_min_max::ArgMinEval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
TfLiteRegistration Register_ARG_MIN() {
return {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/arg_min_max::ArgMinEval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
}

} // namespace micro

@ -18,6 +18,7 @@ limitations under the License.
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"

namespace tflite {
namespace ops {
@ -32,8 +33,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
TF_LITE_ENSURE_EQ(context, output->type, input->type);
TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32);
TF_LITE_ENSURE_TYPES_EQ(context, output->type, input->type);
TF_LITE_ENSURE_EQ(context, output->bytes, input->bytes);
TF_LITE_ENSURE_EQ(context, output->dims->size, input->dims->size);
for (int i = 0; i < output->dims->size; ++i) {
@ -43,26 +44,29 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
}

TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
const TfLiteEvalTensor* input =
tflite::micro::GetEvalInput(context, node, kInputTensor);
TfLiteEvalTensor* output =
tflite::micro::GetEvalOutput(context, node, kOutputTensor);

reference_ops::Ceil(GetTensorShape(input), GetTensorData<float>(input),
GetTensorShape(output), GetTensorData<float>(output));
reference_ops::Ceil(tflite::micro::GetTensorShape(input),
tflite::micro::GetTensorData<float>(input),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<float>(output));

return kTfLiteOk;
}
} // namespace ceil

TfLiteRegistration* Register_CEIL() {
static TfLiteRegistration r = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/ceil::Prepare,
/*invoke=*/ceil::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
TfLiteRegistration Register_CEIL() {
return {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/ceil::Prepare,
/*invoke=*/ceil::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
}

} // namespace micro

@ -22,6 +22,7 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/op_macros.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"

/*
* The circular buffer custom operator is used to implement strided streaming
@ -89,10 +90,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE_EQ(context, 1, input->dims->data[2]);
TF_LITE_ENSURE_EQ(context, output->dims->data[3], input->dims->data[3]);

TF_LITE_ENSURE_EQ(context, input->type, output->type);
TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);

// The circular buffer custom operator currently only supports int8.
TF_LITE_ENSURE_EQ(context, input->type, kTfLiteInt8);
// The circular buffer custom operator currently only supports int8_t.
TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteInt8);

// TODO(b/132070898): Use statically slotted OpData structures until a
// scratch memory API is ready.
@ -121,8 +122,10 @@ void EvalInt8(const int8_t* input, int num_slots, int depth, int8_t* output) {
}

TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
const TfLiteEvalTensor* input =
tflite::micro::GetEvalInput(context, node, kInputTensor);
TfLiteEvalTensor* output =
tflite::micro::GetEvalOutput(context, node, kOutputTensor);

OpData* data = reinterpret_cast<OpData*>(node->user_data);

@ -130,8 +133,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
int depth = output->dims->data[3];

if (input->type == kTfLiteInt8) {
EvalInt8(GetTensorData<int8_t>(input), num_slots, depth,
GetTensorData<int8_t>(output));
EvalInt8(tflite::micro::GetTensorData<int8_t>(input), num_slots, depth,
tflite::micro::GetTensorData<int8_t>(output));
} else {
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
TfLiteTypeGetName(input->type), input->type);

@ -18,6 +18,7 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"

namespace tflite {
namespace ops {
@ -25,103 +26,109 @@ namespace micro {
namespace comparisons {
namespace {

struct OpData {
ComparisonParams params;
};

constexpr int kInputTensor1 = 0;
constexpr int kInputTensor2 = 1;
constexpr int kOutputTensor = 0;

// TODO(ruic): optimize macros below to using template functions.
#define TF_LITE_QUANTIZE_COMPARISON(opname) \
template <typename input_dtype> \
void EvalQuantized##opname(TfLiteContext* context, TfLiteNode* node, \
const TfLiteTensor* input1, \
const TfLiteTensor* input2, TfLiteTensor* output, \
bool requires_broadcast) { \
if (input1->type == kTfLiteUInt8 || input1->type == kTfLiteInt8) { \
auto input1_offset = -input1->params.zero_point; \
auto input2_offset = -input2->params.zero_point; \
const int left_shift = 8; \
\
int32 input1_multiplier; \
int input1_shift; \
QuantizeMultiplierSmallerThanOneExp( \
static_cast<double>(input1->params.scale), &input1_multiplier, \
&input1_shift); \
int32 input2_multiplier; \
int input2_shift; \
QuantizeMultiplierSmallerThanOneExp( \
static_cast<double>(input2->params.scale), &input2_multiplier, \
&input2_shift); \
\
ComparisonParams op_params; \
op_params.left_shift = left_shift; \
op_params.input1_offset = input1_offset; \
op_params.input1_multiplier = input1_multiplier; \
op_params.input1_shift = input1_shift; \
op_params.input2_offset = input2_offset; \
op_params.input2_multiplier = input2_multiplier; \
op_params.input2_shift = input2_shift; \
if (requires_broadcast) { \
reference_ops::Broadcast4DSlow##opname##WithScaling( \
op_params, GetTensorShape(input1), \
GetTensorData<input_dtype>(input1), GetTensorShape(input2), \
GetTensorData<input_dtype>(input2), GetTensorShape(output), \
GetTensorData<bool>(output)); \
} else { \
reference_ops::opname##WithScaling( \
op_params, GetTensorShape(input1), \
GetTensorData<input_dtype>(input1), GetTensorShape(input2), \
GetTensorData<input_dtype>(input2), GetTensorShape(output), \
GetTensorData<bool>(output)); \
} \
} \
}
TF_LITE_QUANTIZE_COMPARISON(Equal);
TF_LITE_QUANTIZE_COMPARISON(NotEqual);
TF_LITE_QUANTIZE_COMPARISON(Greater);
TF_LITE_QUANTIZE_COMPARISON(GreaterEqual);
TF_LITE_QUANTIZE_COMPARISON(Less);
TF_LITE_QUANTIZE_COMPARISON(LessEqual);
#undef TF_LITE_QUANTIZE_COMPARISON

#define TF_LITE_COMPARISON(type, opname, requires_broadcast) \
{ \
ComparisonParams op_params; \
requires_broadcast \
? reference_ops::Broadcast4DSlow##opname##NoScaling( \
op_params, GetTensorShape(input1), GetTensorData<type>(input1), \
GetTensorShape(input2), GetTensorData<type>(input2), \
GetTensorShape(output), GetTensorData<bool>(output)) \
: reference_ops::opname##NoScaling( \
op_params, GetTensorShape(input1), GetTensorData<type>(input1), \
GetTensorShape(input2), GetTensorData<type>(input2), \
GetTensorShape(output), GetTensorData<bool>(output)); \
}

TfLiteStatus EqualEval(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
bool requires_broadcast = !HaveSameShapes(input1, input2);
TFLITE_DCHECK(node->user_data != nullptr);
const OpData* data = static_cast<const OpData*>(node->user_data);

const TfLiteEvalTensor* input1 =
tflite::micro::GetEvalInput(context, node, kInputTensor1);
const TfLiteEvalTensor* input2 =
tflite::micro::GetEvalInput(context, node, kInputTensor2);
TfLiteEvalTensor* output =
tflite::micro::GetEvalOutput(context, node, kOutputTensor);

RuntimeShape input1_shape = tflite::micro::GetTensorShape(input1);
RuntimeShape input2_shape = tflite::micro::GetTensorShape(input2);
RuntimeShape output_shape = tflite::micro::GetTensorShape(output);
bool* output_data = tflite::micro::GetTensorData<bool>(output);

bool requires_broadcast = !tflite::micro::HaveSameShapes(input1, input2);
switch (input1->type) {
case kTfLiteBool:
TF_LITE_COMPARISON(bool, Equal, requires_broadcast);
requires_broadcast
? reference_ops::Broadcast4DSlowEqualNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<bool>(input1), input2_shape,
tflite::micro::GetTensorData<bool>(input2), output_shape,
output_data)
: reference_ops::EqualNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<bool>(input1), input2_shape,
tflite::micro::GetTensorData<bool>(input2), output_shape,
output_data);
break;
case kTfLiteFloat32:
TF_LITE_COMPARISON(float, Equal, requires_broadcast);
requires_broadcast
? reference_ops::Broadcast4DSlowEqualNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<float>(input1), input2_shape,
tflite::micro::GetTensorData<float>(input2), output_shape,
output_data)
: reference_ops::EqualNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<float>(input1), input2_shape,
tflite::micro::GetTensorData<float>(input2), output_shape,
output_data);
break;
case kTfLiteInt32:
TF_LITE_COMPARISON(int32_t, Equal, requires_broadcast);
requires_broadcast
? reference_ops::Broadcast4DSlowEqualNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<int32_t>(input1), input2_shape,
tflite::micro::GetTensorData<int32_t>(input2), output_shape,
output_data)
: reference_ops::EqualNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<int32_t>(input1), input2_shape,
tflite::micro::GetTensorData<int32_t>(input2), output_shape,
output_data);
break;
case kTfLiteInt64:
TF_LITE_COMPARISON(int64_t, Equal, requires_broadcast);
requires_broadcast
? reference_ops::Broadcast4DSlowEqualNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<int64_t>(input1), input2_shape,
tflite::micro::GetTensorData<int64_t>(input2), output_shape,
output_data)
: reference_ops::EqualNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<int64_t>(input1), input2_shape,
tflite::micro::GetTensorData<int64_t>(input2), output_shape,
output_data);
break;
case kTfLiteUInt8:
EvalQuantizedEqual<uint8_t>(context, node, input1, input2, output,
requires_broadcast);
requires_broadcast
? reference_ops::Broadcast4DSlowEqualWithScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<uint8_t>(input1), input2_shape,
tflite::micro::GetTensorData<uint8_t>(input2), output_shape,
output_data)
: reference_ops::EqualWithScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<uint8_t>(input1), input2_shape,
tflite::micro::GetTensorData<uint8_t>(input2), output_shape,
output_data);
break;
case kTfLiteInt8:
EvalQuantizedEqual<int8_t>(context, node, input1, input2, output,
requires_broadcast);
requires_broadcast
? reference_ops::Broadcast4DSlowEqualWithScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<int8_t>(input1), input2_shape,
tflite::micro::GetTensorData<int8_t>(input2), output_shape,
output_data)
: reference_ops::EqualWithScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<int8_t>(input1), input2_shape,
tflite::micro::GetTensorData<int8_t>(input2), output_shape,
output_data);
break;
default:
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
@ -133,30 +140,100 @@ TfLiteStatus EqualEval(TfLiteContext* context, TfLiteNode* node) {

// TODO(renjieliu): Refactor the logic to avoid duplications.
TfLiteStatus NotEqualEval(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
bool requires_broadcast = !HaveSameShapes(input1, input2);
TFLITE_DCHECK(node->user_data != nullptr);
const OpData* data = static_cast<const OpData*>(node->user_data);

const TfLiteEvalTensor* input1 =
tflite::micro::GetEvalInput(context, node, kInputTensor1);
const TfLiteEvalTensor* input2 =
tflite::micro::GetEvalInput(context, node, kInputTensor2);
TfLiteEvalTensor* output =
tflite::micro::GetEvalOutput(context, node, kOutputTensor);

RuntimeShape input1_shape = tflite::micro::GetTensorShape(input1);
RuntimeShape input2_shape = tflite::micro::GetTensorShape(input2);
RuntimeShape output_shape = tflite::micro::GetTensorShape(output);
bool* output_data = tflite::micro::GetTensorData<bool>(output);

bool requires_broadcast = !tflite::micro::HaveSameShapes(input1, input2);
switch (input1->type) {
case kTfLiteBool:
TF_LITE_COMPARISON(bool, NotEqual, requires_broadcast);
requires_broadcast
? reference_ops::Broadcast4DSlowNotEqualNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<bool>(input1), input2_shape,
tflite::micro::GetTensorData<bool>(input2), output_shape,
output_data)
: reference_ops::NotEqualNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<bool>(input1), input2_shape,
tflite::micro::GetTensorData<bool>(input2), output_shape,
output_data);
break;
case kTfLiteFloat32:
TF_LITE_COMPARISON(float, NotEqual, requires_broadcast);
requires_broadcast
? reference_ops::Broadcast4DSlowNotEqualNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<float>(input1), input2_shape,
tflite::micro::GetTensorData<float>(input2), output_shape,
output_data)
: reference_ops::NotEqualNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<float>(input1), input2_shape,
tflite::micro::GetTensorData<float>(input2), output_shape,
output_data);
break;
case kTfLiteInt32:
TF_LITE_COMPARISON(int32_t, NotEqual, requires_broadcast);
requires_broadcast
? reference_ops::Broadcast4DSlowNotEqualNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<int32_t>(input1), input2_shape,
tflite::micro::GetTensorData<int32_t>(input2), output_shape,
output_data)
: reference_ops::NotEqualNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<int32_t>(input1), input2_shape,
tflite::micro::GetTensorData<int32_t>(input2), output_shape,
output_data);
break;
case kTfLiteInt64:
TF_LITE_COMPARISON(int64_t, NotEqual, requires_broadcast);
requires_broadcast
? reference_ops::Broadcast4DSlowNotEqualNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<int64_t>(input1), input2_shape,
tflite::micro::GetTensorData<int64_t>(input2), output_shape,
output_data)
: reference_ops::NotEqualNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<int64_t>(input1), input2_shape,
tflite::micro::GetTensorData<int64_t>(input2), output_shape,
output_data);
break;
case kTfLiteUInt8:
EvalQuantizedNotEqual<uint8_t>(context, node, input1, input2, output,
requires_broadcast);
requires_broadcast
? reference_ops::Broadcast4DSlowNotEqualWithScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<uint8_t>(input1), input2_shape,
tflite::micro::GetTensorData<uint8_t>(input2), output_shape,
output_data)
: reference_ops::NotEqualWithScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<uint8_t>(input1), input2_shape,
tflite::micro::GetTensorData<uint8_t>(input2), output_shape,
output_data);
break;
case kTfLiteInt8:
EvalQuantizedNotEqual<int8_t>(context, node, input1, input2, output,
requires_broadcast);
requires_broadcast
? reference_ops::Broadcast4DSlowNotEqualWithScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<int8_t>(input1), input2_shape,
tflite::micro::GetTensorData<int8_t>(input2), output_shape,
output_data)
: reference_ops::NotEqualWithScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<int8_t>(input1), input2_shape,
tflite::micro::GetTensorData<int8_t>(input2), output_shape,
output_data);
break;
default:
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
@ -167,27 +244,87 @@ TfLiteStatus NotEqualEval(TfLiteContext* context, TfLiteNode* node) {
}

TfLiteStatus GreaterEval(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
bool requires_broadcast = !HaveSameShapes(input1, input2);
TFLITE_DCHECK(node->user_data != nullptr);
const OpData* data = static_cast<const OpData*>(node->user_data);

const TfLiteEvalTensor* input1 =
tflite::micro::GetEvalInput(context, node, kInputTensor1);
const TfLiteEvalTensor* input2 =
tflite::micro::GetEvalInput(context, node, kInputTensor2);
TfLiteEvalTensor* output =
tflite::micro::GetEvalOutput(context, node, kOutputTensor);

RuntimeShape input1_shape = tflite::micro::GetTensorShape(input1);
RuntimeShape input2_shape = tflite::micro::GetTensorShape(input2);
RuntimeShape output_shape = tflite::micro::GetTensorShape(output);
bool* output_data = tflite::micro::GetTensorData<bool>(output);

bool requires_broadcast = !tflite::micro::HaveSameShapes(input1, input2);
switch (input1->type) {
case kTfLiteFloat32:
TF_LITE_COMPARISON(float, Greater, requires_broadcast);
requires_broadcast
? reference_ops::Broadcast4DSlowGreaterNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<float>(input1), input2_shape,
tflite::micro::GetTensorData<float>(input2), output_shape,
output_data)
: reference_ops::GreaterNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<float>(input1), input2_shape,
tflite::micro::GetTensorData<float>(input2), output_shape,
output_data);
break;
case kTfLiteInt32:
TF_LITE_COMPARISON(int32_t, Greater, requires_broadcast);
requires_broadcast
? reference_ops::Broadcast4DSlowGreaterNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<int32_t>(input1), input2_shape,
tflite::micro::GetTensorData<int32_t>(input2), output_shape,
output_data)
: reference_ops::GreaterNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<int32_t>(input1), input2_shape,
tflite::micro::GetTensorData<int32_t>(input2), output_shape,
output_data);
break;
case kTfLiteInt64:
TF_LITE_COMPARISON(int64_t, Greater, requires_broadcast);
requires_broadcast
? reference_ops::Broadcast4DSlowGreaterNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<int64_t>(input1), input2_shape,
tflite::micro::GetTensorData<int64_t>(input2), output_shape,
output_data)
: reference_ops::GreaterNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<int64_t>(input1), input2_shape,
tflite::micro::GetTensorData<int64_t>(input2), output_shape,
output_data);
break;
case kTfLiteUInt8:
EvalQuantizedGreater<uint8_t>(context, node, input1, input2, output,
requires_broadcast);
requires_broadcast
? reference_ops::Broadcast4DSlowGreaterWithScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<uint8_t>(input1), input2_shape,
tflite::micro::GetTensorData<uint8_t>(input2), output_shape,
output_data)
: reference_ops::GreaterWithScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<uint8_t>(input1), input2_shape,
tflite::micro::GetTensorData<uint8_t>(input2), output_shape,
output_data);
break;
case kTfLiteInt8:
EvalQuantizedGreater<int8_t>(context, node, input1, input2, output,
requires_broadcast);
requires_broadcast
? reference_ops::Broadcast4DSlowGreaterWithScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<int8_t>(input1), input2_shape,
tflite::micro::GetTensorData<int8_t>(input2), output_shape,
output_data)
: reference_ops::GreaterWithScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<int8_t>(input1), input2_shape,
tflite::micro::GetTensorData<int8_t>(input2), output_shape,
output_data);
break;
default:
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
@ -198,27 +335,87 @@ TfLiteStatus GreaterEval(TfLiteContext* context, TfLiteNode* node) {
}

TfLiteStatus GreaterEqualEval(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
bool requires_broadcast = !HaveSameShapes(input1, input2);
TFLITE_DCHECK(node->user_data != nullptr);
const OpData* data = static_cast<const OpData*>(node->user_data);

const TfLiteEvalTensor* input1 =
tflite::micro::GetEvalInput(context, node, kInputTensor1);
const TfLiteEvalTensor* input2 =
tflite::micro::GetEvalInput(context, node, kInputTensor2);
TfLiteEvalTensor* output =
tflite::micro::GetEvalOutput(context, node, kOutputTensor);

RuntimeShape input1_shape = tflite::micro::GetTensorShape(input1);
RuntimeShape input2_shape = tflite::micro::GetTensorShape(input2);
RuntimeShape output_shape = tflite::micro::GetTensorShape(output);
bool* output_data = tflite::micro::GetTensorData<bool>(output);

bool requires_broadcast = !tflite::micro::HaveSameShapes(input1, input2);
switch (input1->type) {
case kTfLiteFloat32:
TF_LITE_COMPARISON(float, GreaterEqual, requires_broadcast);
requires_broadcast
? reference_ops::Broadcast4DSlowGreaterEqualNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<float>(input1), input2_shape,
tflite::micro::GetTensorData<float>(input2), output_shape,
output_data)
: reference_ops::GreaterEqualNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<float>(input1), input2_shape,
tflite::micro::GetTensorData<float>(input2), output_shape,
output_data);
break;
case kTfLiteInt32:
TF_LITE_COMPARISON(int32_t, GreaterEqual, requires_broadcast);
requires_broadcast
? reference_ops::Broadcast4DSlowGreaterEqualNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<int32_t>(input1), input2_shape,
tflite::micro::GetTensorData<int32_t>(input2), output_shape,
output_data)
: reference_ops::GreaterEqualNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<int32_t>(input1), input2_shape,
tflite::micro::GetTensorData<int32_t>(input2), output_shape,
output_data);
break;
case kTfLiteInt64:
TF_LITE_COMPARISON(int64_t, GreaterEqual, requires_broadcast);
requires_broadcast
? reference_ops::Broadcast4DSlowGreaterEqualNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<int64_t>(input1), input2_shape,
tflite::micro::GetTensorData<int64_t>(input2), output_shape,
output_data)
: reference_ops::GreaterEqualNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<int64_t>(input1), input2_shape,
tflite::micro::GetTensorData<int64_t>(input2), output_shape,
output_data);
break;
case kTfLiteUInt8:
EvalQuantizedGreaterEqual<uint8_t>(context, node, input1, input2, output,
requires_broadcast);
requires_broadcast
? reference_ops::Broadcast4DSlowGreaterEqualWithScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<uint8_t>(input1), input2_shape,
tflite::micro::GetTensorData<uint8_t>(input2), output_shape,
output_data)
: reference_ops::GreaterEqualWithScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<uint8_t>(input1), input2_shape,
tflite::micro::GetTensorData<uint8_t>(input2), output_shape,
output_data);
break;
case kTfLiteInt8:
EvalQuantizedGreaterEqual<int8_t>(context, node, input1, input2, output,
requires_broadcast);
requires_broadcast
? reference_ops::Broadcast4DSlowGreaterEqualWithScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<int8_t>(input1), input2_shape,
tflite::micro::GetTensorData<int8_t>(input2), output_shape,
output_data)
: reference_ops::GreaterEqualWithScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<int8_t>(input1), input2_shape,
tflite::micro::GetTensorData<int8_t>(input2), output_shape,
output_data);
break;
default:
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
@ -229,27 +426,87 @@ TfLiteStatus GreaterEqualEval(TfLiteContext* context, TfLiteNode* node) {
}

TfLiteStatus LessEval(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
bool requires_broadcast = !HaveSameShapes(input1, input2);
TFLITE_DCHECK(node->user_data != nullptr);
const OpData* data = static_cast<const OpData*>(node->user_data);

const TfLiteEvalTensor* input1 =
tflite::micro::GetEvalInput(context, node, kInputTensor1);
const TfLiteEvalTensor* input2 =
tflite::micro::GetEvalInput(context, node, kInputTensor2);
TfLiteEvalTensor* output =
tflite::micro::GetEvalOutput(context, node, kOutputTensor);

RuntimeShape input1_shape = tflite::micro::GetTensorShape(input1);
RuntimeShape input2_shape = tflite::micro::GetTensorShape(input2);
RuntimeShape output_shape = tflite::micro::GetTensorShape(output);
bool* output_data = tflite::micro::GetTensorData<bool>(output);

bool requires_broadcast = !tflite::micro::HaveSameShapes(input1, input2);
switch (input1->type) {
case kTfLiteFloat32:
TF_LITE_COMPARISON(float, Less, requires_broadcast);
requires_broadcast
? reference_ops::Broadcast4DSlowLessNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<float>(input1), input2_shape,
tflite::micro::GetTensorData<float>(input2), output_shape,
output_data)
: reference_ops::LessNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<float>(input1), input2_shape,
tflite::micro::GetTensorData<float>(input2), output_shape,
output_data);
break;
case kTfLiteInt32:
TF_LITE_COMPARISON(int32_t, Less, requires_broadcast);
requires_broadcast
? reference_ops::Broadcast4DSlowLessNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<int32_t>(input1), input2_shape,
tflite::micro::GetTensorData<int32_t>(input2), output_shape,
output_data)
: reference_ops::LessNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<int32_t>(input1), input2_shape,
tflite::micro::GetTensorData<int32_t>(input2), output_shape,
output_data);
break;
case kTfLiteInt64:
TF_LITE_COMPARISON(int64_t, Less, requires_broadcast);
requires_broadcast
? reference_ops::Broadcast4DSlowLessNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<int64_t>(input1), input2_shape,
tflite::micro::GetTensorData<int64_t>(input2), output_shape,
output_data)
: reference_ops::LessNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<int64_t>(input1), input2_shape,
tflite::micro::GetTensorData<int64_t>(input2), output_shape,
output_data);
break;
case kTfLiteUInt8:
EvalQuantizedLess<uint8_t>(context, node, input1, input2, output,
requires_broadcast);
requires_broadcast
? reference_ops::Broadcast4DSlowLessWithScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<uint8_t>(input1), input2_shape,
tflite::micro::GetTensorData<uint8_t>(input2), output_shape,
output_data)
: reference_ops::LessWithScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<uint8_t>(input1), input2_shape,
tflite::micro::GetTensorData<uint8_t>(input2), output_shape,
output_data);
break;
case kTfLiteInt8:
EvalQuantizedLess<int8_t>(context, node, input1, input2, output,
requires_broadcast);
requires_broadcast
? reference_ops::Broadcast4DSlowLessWithScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<int8_t>(input1), input2_shape,
tflite::micro::GetTensorData<int8_t>(input2), output_shape,
output_data)
: reference_ops::LessWithScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<int8_t>(input1), input2_shape,
tflite::micro::GetTensorData<int8_t>(input2), output_shape,
output_data);
break;
default:
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
@ -260,27 +517,87 @@ TfLiteStatus LessEval(TfLiteContext* context, TfLiteNode* node) {
}

TfLiteStatus LessEqualEval(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
bool requires_broadcast = !HaveSameShapes(input1, input2);
TFLITE_DCHECK(node->user_data != nullptr);
const OpData* data = static_cast<const OpData*>(node->user_data);

const TfLiteEvalTensor* input1 =
tflite::micro::GetEvalInput(context, node, kInputTensor1);
const TfLiteEvalTensor* input2 =
tflite::micro::GetEvalInput(context, node, kInputTensor2);
TfLiteEvalTensor* output =
tflite::micro::GetEvalOutput(context, node, kOutputTensor);

RuntimeShape input1_shape = tflite::micro::GetTensorShape(input1);
RuntimeShape input2_shape = tflite::micro::GetTensorShape(input2);
RuntimeShape output_shape = tflite::micro::GetTensorShape(output);
bool* output_data = tflite::micro::GetTensorData<bool>(output);

bool requires_broadcast = !tflite::micro::HaveSameShapes(input1, input2);
switch (input1->type) {
case kTfLiteFloat32:
TF_LITE_COMPARISON(float, LessEqual, requires_broadcast);
requires_broadcast
? reference_ops::Broadcast4DSlowLessEqualNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<float>(input1), input2_shape,
tflite::micro::GetTensorData<float>(input2), output_shape,
output_data)
: reference_ops::LessEqualNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<float>(input1), input2_shape,
tflite::micro::GetTensorData<float>(input2), output_shape,
output_data);
break;
case kTfLiteInt32:
TF_LITE_COMPARISON(int32_t, LessEqual, requires_broadcast);
requires_broadcast
? reference_ops::Broadcast4DSlowLessEqualNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<int32_t>(input1), input2_shape,
tflite::micro::GetTensorData<int32_t>(input2), output_shape,
output_data)
: reference_ops::LessEqualNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<int32_t>(input1), input2_shape,
tflite::micro::GetTensorData<int32_t>(input2), output_shape,
output_data);
break;
case kTfLiteInt64:
TF_LITE_COMPARISON(int64_t, LessEqual, requires_broadcast);
requires_broadcast
? reference_ops::Broadcast4DSlowLessEqualNoScaling(
data->params, input1_shape,
tflite::micro::GetTensorData<int64_t>(input1), input2_shape,
tflite::micro::GetTensorData<int64_t>(input2), output_shape,
output_data)
|
||||
: reference_ops::LessEqualNoScaling(
|
||||
data->params, input1_shape,
|
||||
tflite::micro::GetTensorData<int64_t>(input1), input2_shape,
|
||||
tflite::micro::GetTensorData<int64_t>(input2), output_shape,
|
||||
output_data);
|
||||
break;
|
||||
case kTfLiteUInt8:
|
||||
EvalQuantizedLessEqual<uint8_t>(context, node, input1, input2, output,
|
||||
requires_broadcast);
|
||||
requires_broadcast
|
||||
? reference_ops::Broadcast4DSlowLessEqualWithScaling(
|
||||
data->params, input1_shape,
|
||||
tflite::micro::GetTensorData<uint8_t>(input1), input2_shape,
|
||||
tflite::micro::GetTensorData<uint8_t>(input2), output_shape,
|
||||
output_data)
|
||||
: reference_ops::LessEqualWithScaling(
|
||||
data->params, input1_shape,
|
||||
tflite::micro::GetTensorData<uint8_t>(input1), input2_shape,
|
||||
tflite::micro::GetTensorData<uint8_t>(input2), output_shape,
|
||||
output_data);
|
||||
break;
|
||||
case kTfLiteInt8:
|
||||
EvalQuantizedLessEqual<int8_t>(context, node, input1, input2, output,
|
||||
requires_broadcast);
|
||||
requires_broadcast
|
||||
? reference_ops::Broadcast4DSlowLessEqualWithScaling(
|
||||
data->params, input1_shape,
|
||||
tflite::micro::GetTensorData<int8_t>(input1), input2_shape,
|
||||
tflite::micro::GetTensorData<int8_t>(input2), output_shape,
|
||||
output_data)
|
||||
: reference_ops::LessEqualWithScaling(
|
||||
data->params, input1_shape,
|
||||
tflite::micro::GetTensorData<int8_t>(input1), input2_shape,
|
||||
tflite::micro::GetTensorData<int8_t>(input2), output_shape,
|
||||
output_data);
|
||||
break;
|
||||
default:
|
||||
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
|
||||
@ -291,78 +608,113 @@ TfLiteStatus LessEqualEval(TfLiteContext* context, TfLiteNode* node) {
|
||||
}
|
||||
|
||||
} // namespace

void* Init(TfLiteContext* context, const char* buffer, size_t length) {
  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
  return context->AllocatePersistentBuffer(context, sizeof(OpData));
}

TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  TFLITE_DCHECK(node->user_data != nullptr);
  OpData* data = static_cast<OpData*>(node->user_data);

  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);

  if (input1->type == kTfLiteUInt8 || input1->type == kTfLiteInt8) {
    auto input1_offset = -input1->params.zero_point;
    auto input2_offset = -input2->params.zero_point;
    const int kLeftShift = 8;

    int32_t input1_multiplier;
    int input1_shift;
    QuantizeMultiplierSmallerThanOneExp(
        static_cast<double>(input1->params.scale), &input1_multiplier,
        &input1_shift);
    int32_t input2_multiplier;
    int input2_shift;
    QuantizeMultiplierSmallerThanOneExp(
        static_cast<double>(input2->params.scale), &input2_multiplier,
        &input2_shift);

    data->params.left_shift = kLeftShift;
    data->params.input1_offset = input1_offset;
    data->params.input1_multiplier = input1_multiplier;
    data->params.input1_shift = input1_shift;
    data->params.input2_offset = input2_offset;
    data->params.input2_multiplier = input2_multiplier;
    data->params.input2_shift = input2_shift;
  }

  return kTfLiteOk;
}

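For reference, QuantizeMultiplierSmallerThanOneExp folds each floating-point scale into a 32-bit fixed-point multiplier plus a shift so the quantized comparison needs no float math at invoke time. The sketch below is only a conceptual illustration of that decomposition; the function name ScaleToMultiplierAndShift is made up here and is not the library routine.

#include <cmath>
#include <cstdint>

// Conceptual sketch, not the TFLite implementation: express `scale` as
// multiplier * 2^shift with the multiplier held in Q31 fixed point.
void ScaleToMultiplierAndShift(double scale, int32_t* multiplier, int* shift) {
  int exponent = 0;
  const double mantissa = std::frexp(scale, &exponent);  // scale = mantissa * 2^exponent
  int64_t q = static_cast<int64_t>(std::round(mantissa * (1ll << 31)));
  if (q == (1ll << 31)) {  // rounding hit 2^31: halve and bump the exponent
    q /= 2;
    ++exponent;
  }
  *multiplier = static_cast<int32_t>(q);
  *shift = exponent;  // a negative shift means a right shift at runtime
}
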
} // namespace comparisons

TfLiteRegistration* Register_EQUAL() {
  static TfLiteRegistration r = {/*init=*/nullptr,
                                 /*free=*/nullptr,
                                 /*prepare=*/nullptr,
                                 /*invoke=*/comparisons::EqualEval,
                                 /*profiling_string=*/nullptr,
                                 /*builtin_code=*/0,
                                 /*custom_name=*/nullptr,
                                 /*version=*/0};
  return &r;
TfLiteRegistration Register_EQUAL() {
  return {/*init=*/comparisons::Init,
          /*free=*/nullptr,
          /*prepare=*/comparisons::Prepare,
          /*invoke=*/comparisons::EqualEval,
          /*profiling_string=*/nullptr,
          /*builtin_code=*/0,
          /*custom_name=*/nullptr,
          /*version=*/0};
}

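The same change repeats for the remaining comparison ops below: each Register_* now returns the TfLiteRegistration by value and wires in the shared Init/Prepare hooks instead of leaving them null. A minimal usage sketch, illustrative only (the interpreter normally fetches these through the op resolver rather than calling them directly):

// Illustrative only: the caller now receives its own copy of the registration,
// so there is no pointer to a function-local static to keep alive.
const TfLiteRegistration equal_op = tflite::ops::micro::Register_EQUAL();
// init and prepare are populated, so per-node OpData is allocated and the
// quantized comparison parameters are precomputed before the first invoke.
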
|
||||
TfLiteRegistration* Register_NOT_EQUAL() {
|
||||
static TfLiteRegistration r = {/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/nullptr,
|
||||
/*invoke=*/comparisons::NotEqualEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
TfLiteRegistration Register_NOT_EQUAL() {
|
||||
return {/*init=*/comparisons::Init,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/comparisons::Prepare,
|
||||
/*invoke=*/comparisons::NotEqualEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
TfLiteRegistration* Register_GREATER() {
|
||||
static TfLiteRegistration r = {/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/nullptr,
|
||||
/*invoke=*/comparisons::GreaterEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
TfLiteRegistration Register_GREATER() {
|
||||
return {/*init=*/comparisons::Init,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/comparisons::Prepare,
|
||||
/*invoke=*/comparisons::GreaterEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
TfLiteRegistration* Register_GREATER_EQUAL() {
|
||||
static TfLiteRegistration r = {/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/nullptr,
|
||||
/*invoke=*/comparisons::GreaterEqualEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
TfLiteRegistration Register_GREATER_EQUAL() {
|
||||
return {/*init=*/comparisons::Init,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/comparisons::Prepare,
|
||||
/*invoke=*/comparisons::GreaterEqualEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
TfLiteRegistration* Register_LESS() {
|
||||
static TfLiteRegistration r = {/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/nullptr,
|
||||
/*invoke=*/comparisons::LessEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
TfLiteRegistration Register_LESS() {
|
||||
return {/*init=*/comparisons::Init,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/comparisons::Prepare,
|
||||
/*invoke=*/comparisons::LessEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
TfLiteRegistration* Register_LESS_EQUAL() {
|
||||
static TfLiteRegistration r = {/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/nullptr,
|
||||
/*invoke=*/comparisons::LessEqualEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
TfLiteRegistration Register_LESS_EQUAL() {
|
||||
return {/*init=*/comparisons::Init,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/comparisons::Prepare,
|
||||
/*invoke=*/comparisons::LessEqualEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
} // namespace micro
|
||||
|
||||
@ -18,10 +18,11 @@ limitations under the License.

#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/tensor.h"
#include "tensorflow/lite/kernels/internal/portable_tensor.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/internal/types.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"

namespace tflite {
namespace ops {
@ -31,6 +32,104 @@ namespace concatenation {
constexpr int kMaxInputNum = 10; // Maximum number of input tensors
constexpr int kOutputTensor = 0;

struct OpData {
  ConcatenationParams params;
};

// Handles negative axis index, coerces to positive index value.
inline int CalculatePositiveAxis(int axis, const TfLiteTensor* output_tensor) {
  if (axis >= 0) {
    return axis;
  } else {
    return NumDimensions(output_tensor) + axis;
  }
}

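A quick worked example of the axis coercion above, since concat axes are commonly given as negative indices. This is a hypothetical standalone copy of the same rule so it can be read without the TFLite headers:

// Standalone copy of the coercion rule, for illustration only.
inline int CoercedAxis(int axis, int num_dims) {
  return axis >= 0 ? axis : num_dims + axis;
}
// e.g. for a 4-D NHWC output: CoercedAxis(-1, 4) == 3 (concatenate along channels),
//      CoercedAxis(1, 4)  == 1 (concatenate along height).
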
// The following functions are helpers to get tensor data in the format that the
|
||||
// reference op implementation expects. They provide the same functionality as
|
||||
// class VectorOfTensors and class VectorOfQuantizedTensors in TFLite.
|
||||
|
||||
// Gets shapes from a list of tensors.
|
||||
inline void GetAllInputTensorShapes(const TfLiteContext* context,
|
||||
const TfLiteNode* node,
|
||||
RuntimeShape all_shapes[kMaxInputNum]) {
|
||||
TFLITE_DCHECK(context != nullptr);
|
||||
TFLITE_DCHECK(node != nullptr);
|
||||
for (int i = 0; i < node->inputs->size; ++i) {
|
||||
const TfLiteEvalTensor* t = tflite::micro::GetEvalInput(context, node, i);
|
||||
RuntimeShape shape = tflite::micro::GetTensorShape(t);
|
||||
all_shapes[i].ReplaceWith(shape.DimensionsCount(), shape.DimsData());
|
||||
}
|
||||
}
|
||||
|
||||
// Get shape pointers from a list of shapes.
|
||||
inline void GetShapesPointers(const RuntimeShape* shapes, size_t num,
|
||||
const RuntimeShape* pointers[]) {
|
||||
for (size_t i = 0; i < num; ++i) {
|
||||
pointers[i] = &shapes[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Gets data pointers from a list of tensors.
|
||||
template <typename T>
|
||||
inline void GetAllInputTensorData(const TfLiteContext* context,
|
||||
const TfLiteNode* node,
|
||||
T* all_data[kMaxInputNum]) {
|
||||
TFLITE_DCHECK(context != nullptr);
|
||||
TFLITE_DCHECK(node != nullptr);
|
||||
for (int i = 0; i < node->inputs->size; ++i) {
|
||||
const TfLiteEvalTensor* t = tflite::micro::GetEvalInput(context, node, i);
|
||||
all_data[i] = tflite::micro::GetTensorData<T>(t);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename data_type>
|
||||
void EvalUnquantized(TfLiteContext* context, TfLiteNode* node) {
|
||||
// Collect the shapes and data pointer of input tensors
|
||||
RuntimeShape inputs_shape[kMaxInputNum];
|
||||
const RuntimeShape* inputs_shape_ptr[kMaxInputNum];
|
||||
const data_type* inputs_data[kMaxInputNum];
|
||||
GetAllInputTensorShapes(context, node, inputs_shape);
|
||||
GetShapesPointers(inputs_shape, node->inputs->size, inputs_shape_ptr);
|
||||
GetAllInputTensorData(context, node, inputs_data);
|
||||
|
||||
TfLiteEvalTensor* output =
|
||||
tflite::micro::GetEvalOutput(context, node, kOutputTensor);
|
||||
|
||||
TFLITE_DCHECK(node->user_data != nullptr);
|
||||
const OpData* data = static_cast<const OpData*>(node->user_data);
|
||||
|
||||
reference_ops::Concatenation(data->params, inputs_shape_ptr, inputs_data,
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<data_type>(output));
|
||||
}
|
||||
|
||||
void EvalQuantizedUInt8(TfLiteContext* context, TfLiteNode* node) {
|
||||
// Collect the shapes and data pointer of input tensors
|
||||
RuntimeShape inputs_shape[kMaxInputNum];
|
||||
const RuntimeShape* inputs_shape_ptr[kMaxInputNum];
|
||||
const uint8_t* inputs_data[kMaxInputNum];
|
||||
GetAllInputTensorShapes(context, node, inputs_shape);
|
||||
GetShapesPointers(inputs_shape, node->inputs->size, inputs_shape_ptr);
|
||||
GetAllInputTensorData(context, node, inputs_data);
|
||||
|
||||
TfLiteEvalTensor* output =
|
||||
tflite::micro::GetEvalOutput(context, node, kOutputTensor);
|
||||
|
||||
TFLITE_DCHECK(node->user_data != nullptr);
|
||||
const OpData* data = static_cast<const OpData*>(node->user_data);
|
||||
|
||||
reference_ops::ConcatenationWithScaling(
|
||||
data->params, inputs_shape_ptr, inputs_data,
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<uint8_t>(output));
|
||||
}
|
||||
|
||||
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
|
||||
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
|
||||
return context->AllocatePersistentBuffer(context, sizeof(OpData));
|
||||
}
|
||||
|
||||
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
|
||||
// This function only checks the types. Additional shape validations are
|
||||
// performed in the reference implementation called during Eval().
|
||||
@ -63,125 +162,63 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
|
||||
TF_LITE_KERNEL_LOG(
|
||||
context,
|
||||
"Op Concatenation does not currently support num dimensions >4 "
|
||||
"Tensor '%s' has %d dimensions.",
|
||||
input->name, num_dimensions);
|
||||
"Tensor has %d dimensions.",
|
||||
num_dimensions);
|
||||
return kTfLiteError;
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate OpData.
|
||||
TFLITE_DCHECK(node->user_data != nullptr);
|
||||
OpData* data = static_cast<OpData*>(node->user_data);
|
||||
|
||||
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
|
||||
|
||||
switch (output_type) { // Already know in/outtypes are same.
|
||||
case kTfLiteFloat32:
|
||||
case kTfLiteInt32:
|
||||
case kTfLiteInt64: {
|
||||
data->params.axis = CalculatePositiveAxis(params->axis, output);
|
||||
data->params.inputs_count = node->inputs->size;
|
||||
break;
|
||||
}
|
||||
case kTfLiteUInt8:
|
||||
case kTfLiteInt8: {
|
||||
data->params.axis = CalculatePositiveAxis(params->axis, output);
|
||||
data->params.inputs_count = node->inputs->size;
|
||||
|
||||
float* input_scales =
|
||||
reinterpret_cast<float*>(context->AllocatePersistentBuffer(
|
||||
context, node->inputs->size * sizeof(float)));
|
||||
|
||||
int32_t* input_zero_points =
|
||||
reinterpret_cast<int32_t*>(context->AllocatePersistentBuffer(
|
||||
context, node->inputs->size * sizeof(int32_t)));
|
||||
|
||||
// Allocate persistent scale and zeropoint buffers.
|
||||
// Store input scale and zero point values in OpParams:
|
||||
for (int i = 0; i < node->inputs->size; ++i) {
|
||||
const TfLiteTensor* t = GetInput(context, node, i);
|
||||
input_scales[i] = t->params.scale;
|
||||
input_zero_points[i] = t->params.zero_point;
|
||||
}
|
||||
|
||||
data->params.input_scale = input_scales;
|
||||
data->params.input_zeropoint = input_zero_points;
|
||||
data->params.output_zeropoint = output->params.zero_point;
|
||||
data->params.output_scale = output->params.scale;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
TF_LITE_KERNEL_LOG(
|
||||
context, "Op Concatenation does not currently support Type '%s'.",
|
||||
TfLiteTypeGetName(output_type));
|
||||
return kTfLiteError;
|
||||
}
|
||||
|
||||
return kTfLiteOk;
|
||||
}
|
||||
|
||||
// Handles negative axis index, coerces to positive index value.
|
||||
inline int CalculatePositiveAxis(int axis, const TfLiteTensor* output_tensor) {
|
||||
if (axis >= 0) {
|
||||
return axis;
|
||||
} else {
|
||||
return NumDimensions(output_tensor) + axis;
|
||||
}
|
||||
}
|
||||
|
||||
// The following functions are helpers to get tensor data in the format that the
|
||||
// reference op implementation expects. They provide the same functionality as
|
||||
// class VectorOfTensors and class VectorOfQuantizedTensors in TFLite.
|
||||
|
||||
// Gets shapes from a list of tensors.
|
||||
inline void GetAllTensorShapes(const TfLiteContext& context,
|
||||
const TfLiteIntArray& tensor_list,
|
||||
RuntimeShape all_shapes[kMaxInputNum]) {
|
||||
for (int i = 0; i < tensor_list.size; ++i) {
|
||||
const TfLiteTensor* t = &context.tensors[tensor_list.data[i]];
|
||||
RuntimeShape shape = GetTensorShape(t);
|
||||
all_shapes[i].ReplaceWith(shape.DimensionsCount(), shape.DimsData());
|
||||
}
|
||||
}
|
||||
|
||||
// Get shape pointers from a list of shapes.
|
||||
inline void GetShapesPointers(const RuntimeShape* shapes, size_t num,
|
||||
const RuntimeShape* pointers[]) {
|
||||
for (size_t i = 0; i < num; ++i) {
|
||||
pointers[i] = &shapes[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Gets data pointers from a list of tensors.
|
||||
template <typename T>
|
||||
inline void GetAllTensorData(const TfLiteContext& context,
|
||||
const TfLiteIntArray& tensor_list,
|
||||
T* all_data[kMaxInputNum]) {
|
||||
for (int i = 0; i < tensor_list.size; ++i) {
|
||||
const TfLiteTensor* t = &context.tensors[tensor_list.data[i]];
|
||||
all_data[i] = GetTensorData<T>(t);
|
||||
}
|
||||
}
|
||||
|
||||
// Gets scale and zero point from a list of tensors
|
||||
inline void GetAllQuantizationParam(const TfLiteContext& context,
|
||||
const TfLiteIntArray& tensor_list,
|
||||
float scales[kMaxInputNum],
|
||||
int32 zero_points[kMaxInputNum]) {
|
||||
for (int i = 0; i < tensor_list.size; ++i) {
|
||||
const TfLiteTensor* t = &context.tensors[tensor_list.data[i]];
|
||||
scales[i] = t->params.scale;
|
||||
zero_points[i] = t->params.zero_point;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename data_type>
|
||||
void EvalUnquantized(TfLiteContext* context, TfLiteNode* node) {
|
||||
// Collect the shapes and data pointer of input tensors
|
||||
RuntimeShape inputs_shape[kMaxInputNum];
|
||||
const RuntimeShape* inputs_shape_ptr[kMaxInputNum];
|
||||
const data_type* inputs_data[kMaxInputNum];
|
||||
GetAllTensorShapes(*context, *node->inputs, inputs_shape);
|
||||
GetShapesPointers(inputs_shape, node->inputs->size, inputs_shape_ptr);
|
||||
GetAllTensorData(*context, *node->inputs, inputs_data);
|
||||
|
||||
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
|
||||
|
||||
const TfLiteConcatenationParams* params =
|
||||
reinterpret_cast<TfLiteConcatenationParams*>(node->builtin_data);
|
||||
|
||||
ConcatenationParams op_params;
|
||||
op_params.axis = CalculatePositiveAxis(params->axis, output);
|
||||
op_params.inputs_count = NumInputs(node);
|
||||
|
||||
reference_ops::Concatenation(op_params, inputs_shape_ptr, inputs_data,
|
||||
GetTensorShape(output),
|
||||
GetTensorData<data_type>(output));
|
||||
}
|
||||
|
||||
void EvalQuantizedUInt8(TfLiteContext* context, TfLiteNode* node) {
|
||||
// Collect the shapes and data pointer of input tensors
|
||||
RuntimeShape inputs_shape[kMaxInputNum];
|
||||
const RuntimeShape* inputs_shape_ptr[kMaxInputNum];
|
||||
const uint8_t* inputs_data[kMaxInputNum];
|
||||
float inputs_scale[kMaxInputNum];
|
||||
int32 inputs_zero_point[kMaxInputNum];
|
||||
GetAllTensorShapes(*context, *node->inputs, inputs_shape);
|
||||
GetShapesPointers(inputs_shape, node->inputs->size, inputs_shape_ptr);
|
||||
GetAllTensorData(*context, *node->inputs, inputs_data);
|
||||
GetAllQuantizationParam(*context, *node->inputs, inputs_scale,
|
||||
inputs_zero_point);
|
||||
|
||||
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
|
||||
|
||||
const TfLiteConcatenationParams* params =
|
||||
reinterpret_cast<TfLiteConcatenationParams*>(node->builtin_data);
|
||||
|
||||
ConcatenationParams op_params;
|
||||
op_params.axis = CalculatePositiveAxis(params->axis, output);
|
||||
op_params.inputs_count = NumInputs(node);
|
||||
op_params.input_zeropoint = inputs_zero_point;
|
||||
op_params.input_scale = inputs_scale;
|
||||
op_params.output_zeropoint = output->params.zero_point;
|
||||
op_params.output_scale = output->params.scale;
|
||||
|
||||
reference_ops::ConcatenationWithScaling(op_params, inputs_shape_ptr,
|
||||
inputs_data, GetTensorShape(output),
|
||||
GetTensorData<uint8>(output));
|
||||
}
|
||||
|
||||
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
|
||||
TfLiteType output_type = GetOutput(context, node, kOutputTensor)->type;
|
||||
|
||||
@ -214,16 +251,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
|
||||
|
||||
} // namespace concatenation
|
||||
|
||||
TfLiteRegistration* Register_CONCATENATION() {
|
||||
static TfLiteRegistration r = {/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/concatenation::Prepare,
|
||||
/*invoke=*/concatenation::Eval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
TfLiteRegistration Register_CONCATENATION() {
|
||||
return {/*init=*/concatenation::Init,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/concatenation::Prepare,
|
||||
/*invoke=*/concatenation::Eval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
} // namespace micro
|
||||
|
||||
@ -23,6 +23,7 @@ limitations under the License.
|
||||
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
|
||||
#include "tensorflow/lite/kernels/kernel_util.h"
|
||||
#include "tensorflow/lite/kernels/padding.h"
|
||||
#include "tensorflow/lite/micro/kernels/kernel_util.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace ops {
|
||||
@ -42,6 +43,12 @@ constexpr int kConvQuantizedDimension = 0;
|
||||
|
||||
struct OpData {
|
||||
TfLitePaddingValues padding;
|
||||
|
||||
// Cached tensor zero point values for quantized operations.
|
||||
int32_t input_zero_point;
|
||||
int32_t filter_zero_point;
|
||||
int32_t output_zero_point;
|
||||
|
||||
// The scaling factor from input to output (aka the 'real multiplier') can
|
||||
// be represented as a fixed point multiplier plus a left shift.
|
||||
int32_t output_multiplier;
|
||||
@ -109,12 +116,7 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,

void* Init(TfLiteContext* context, const char* buffer, size_t length) {
  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
  void* data = nullptr;
  if (context->AllocatePersistentBuffer(context, sizeof(OpData), &data) ==
      kTfLiteError) {
    return nullptr;
  }
  return data;
  return context->AllocatePersistentBuffer(context, sizeof(OpData));
}

TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
@ -137,12 +139,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {

  // Dynimically allocate per-channel quantization parameters.
  const int num_channels = filter->dims->data[kConvQuantizedDimension];
  TF_LITE_ENSURE_STATUS(context->AllocatePersistentBuffer(
      context, num_channels * sizeof(int32_t),
      reinterpret_cast<void**>(&data->per_channel_output_multiplier)));
  TF_LITE_ENSURE_STATUS(context->AllocatePersistentBuffer(
      context, num_channels * sizeof(int32_t),
      reinterpret_cast<void**>(&data->per_channel_output_shift)));
  data->per_channel_output_multiplier =
      reinterpret_cast<int32_t*>(context->AllocatePersistentBuffer(
          context, num_channels * sizeof(int32_t)));
  data->per_channel_output_shift =
      reinterpret_cast<int32_t*>(context->AllocatePersistentBuffer(
          context, num_channels * sizeof(int32_t)));

  // All per-channel quantized tensors need valid zero point and scale arrays.
  if (input->type == kTfLiteInt8) {
@ -163,19 +165,26 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
                    affine_quantization->zero_point->size);
  }

  return CalculateOpData(context, node, params, input_width, input_height,
                         filter_width, filter_height, output_width,
                         output_height, input->type, data);
  TF_LITE_ENSURE_STATUS(CalculateOpData(
      context, node, params, input_width, input_height, filter_width,
      filter_height, output_width, output_height, input->type, data));

  data->input_zero_point = input->params.zero_point;
  data->filter_zero_point = filter->params.zero_point;
  data->output_zero_point = output->params.zero_point;

  return kTfLiteOk;
} // namespace conv

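The two persistent buffers allocated above are later filled with one fixed-point multiplier and shift per output channel; that happens inside CalculateOpData via library helpers. The sketch below only illustrates the idea, reusing the hypothetical ScaleToMultiplierAndShift from the earlier comparison-kernel note; names and signature are illustrative, not the real API.

#include <cstdint>

// Conceptual sketch: derive per-channel output multipliers/shifts from the
// input, per-channel filter, and output scales. Not the library implementation.
void FillPerChannelParams(float input_scale, const float* filter_scales,
                          float output_scale, int num_channels,
                          int32_t* per_channel_multiplier,
                          int32_t* per_channel_shift) {
  for (int c = 0; c < num_channels; ++c) {
    const double effective_scale =
        static_cast<double>(input_scale) * filter_scales[c] / output_scale;
    int32_t multiplier = 0;
    int shift = 0;
    ScaleToMultiplierAndShift(effective_scale, &multiplier, &shift);  // see earlier sketch
    per_channel_multiplier[c] = multiplier;
    per_channel_shift[c] = shift;
  }
}
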
void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
|
||||
TfLiteConvParams* params, const OpData& data,
|
||||
const TfLiteTensor* input, const TfLiteTensor* filter,
|
||||
const TfLiteTensor* bias, TfLiteTensor* im2col,
|
||||
TfLiteTensor* hwcn_weights, TfLiteTensor* output) {
|
||||
const int32_t input_offset = -input->params.zero_point;
|
||||
const int32_t filter_offset = -filter->params.zero_point;
|
||||
const int32_t output_offset = output->params.zero_point;
|
||||
const TfLiteEvalTensor* input,
|
||||
const TfLiteEvalTensor* filter, const TfLiteEvalTensor* bias,
|
||||
TfLiteEvalTensor* im2col, TfLiteEvalTensor* hwcn_weights,
|
||||
TfLiteEvalTensor* output) {
|
||||
const int32_t input_offset = -data.input_zero_point;
|
||||
const int32_t filter_offset = -data.filter_zero_point;
|
||||
const int32_t output_offset = data.output_zero_point;
|
||||
|
||||
// TODO(b/154032858): Investigate removing extra copies.
|
||||
ConvParams op_params;
|
||||
@ -193,24 +202,29 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
|
||||
op_params.output_shift = -data.output_shift;
|
||||
op_params.quantized_activation_min = data.output_activation_min;
|
||||
op_params.quantized_activation_max = data.output_activation_max;
|
||||
reference_ops::Conv(op_params, GetTensorShape(input),
|
||||
GetTensorData<uint8_t>(input), GetTensorShape(filter),
|
||||
GetTensorData<uint8_t>(filter), GetTensorShape(bias),
|
||||
GetTensorData<int32_t>(bias), GetTensorShape(output),
|
||||
GetTensorData<uint8_t>(output), GetTensorShape(im2col),
|
||||
GetTensorData<uint8_t>(im2col), nullptr);
|
||||
reference_ops::Conv(op_params, tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<uint8_t>(input),
|
||||
tflite::micro::GetTensorShape(filter),
|
||||
tflite::micro::GetTensorData<uint8_t>(filter),
|
||||
tflite::micro::GetTensorShape(bias),
|
||||
tflite::micro::GetTensorData<int32_t>(bias),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<uint8_t>(output),
|
||||
tflite::micro::GetTensorShape(im2col),
|
||||
tflite::micro::GetTensorData<uint8_t>(im2col), nullptr);
|
||||
}
|
||||
|
||||
void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
|
||||
TfLiteConvParams* params, const OpData& data,
|
||||
const TfLiteTensor* input,
|
||||
const TfLiteTensor* filter,
|
||||
const TfLiteTensor* bias, TfLiteTensor* output,
|
||||
TfLiteTensor* im2col) {
|
||||
const TfLiteEvalTensor* input,
|
||||
const TfLiteEvalTensor* filter,
|
||||
const TfLiteEvalTensor* bias,
|
||||
TfLiteEvalTensor* output,
|
||||
TfLiteEvalTensor* im2col) {
|
||||
// TODO(b/154032858): Investigate removing extra copies.
|
||||
ConvParams op_params;
|
||||
op_params.input_offset = -input->params.zero_point;
|
||||
op_params.output_offset = output->params.zero_point;
|
||||
op_params.input_offset = -data.input_zero_point;
|
||||
op_params.output_offset = data.output_zero_point;
|
||||
op_params.stride_height = params->stride_height;
|
||||
op_params.stride_width = params->stride_width;
|
||||
op_params.dilation_height_factor = params->dilation_height_factor;
|
||||
@ -222,18 +236,21 @@ void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
|
||||
|
||||
reference_integer_ops::ConvPerChannel(
|
||||
op_params, data.per_channel_output_multiplier,
|
||||
data.per_channel_output_shift, GetTensorShape(input),
|
||||
GetTensorData<int8>(input), GetTensorShape(filter),
|
||||
GetTensorData<int8>(filter), GetTensorShape(bias),
|
||||
GetTensorData<int32>(bias), GetTensorShape(output),
|
||||
GetTensorData<int8>(output));
|
||||
data.per_channel_output_shift, tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<int8_t>(input),
|
||||
tflite::micro::GetTensorShape(filter),
|
||||
tflite::micro::GetTensorData<int8_t>(filter),
|
||||
tflite::micro::GetTensorShape(bias),
|
||||
tflite::micro::GetTensorData<int32_t>(bias),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<int8_t>(output));
|
||||
}
|
||||
|
||||
void EvalFloat(TfLiteContext* context, TfLiteNode* node,
|
||||
TfLiteConvParams* params, const OpData& data,
|
||||
const TfLiteTensor* input, const TfLiteTensor* filter,
|
||||
const TfLiteTensor* bias, TfLiteTensor* im2col,
|
||||
TfLiteTensor* hwcn_weights, TfLiteTensor* output) {
|
||||
const TfLiteEvalTensor* input, const TfLiteEvalTensor* filter,
|
||||
const TfLiteEvalTensor* bias, TfLiteEvalTensor* im2col,
|
||||
TfLiteEvalTensor* hwcn_weights, TfLiteEvalTensor* output) {
|
||||
float output_activation_min, output_activation_max;
|
||||
CalculateActivationRange(params->activation, &output_activation_min,
|
||||
&output_activation_max);
|
||||
@ -249,21 +266,31 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node,
|
||||
op_params.float_activation_min = output_activation_min;
|
||||
op_params.float_activation_max = output_activation_max;
|
||||
|
||||
reference_ops::Conv(op_params, GetTensorShape(input),
|
||||
GetTensorData<float>(input), GetTensorShape(filter),
|
||||
GetTensorData<float>(filter), GetTensorShape(bias),
|
||||
GetTensorData<float>(bias), GetTensorShape(output),
|
||||
GetTensorData<float>(output), GetTensorShape(im2col),
|
||||
GetTensorData<float>(im2col));
|
||||
reference_ops::Conv(op_params, tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<float>(input),
|
||||
tflite::micro::GetTensorShape(filter),
|
||||
tflite::micro::GetTensorData<float>(filter),
|
||||
tflite::micro::GetTensorShape(bias),
|
||||
tflite::micro::GetTensorData<float>(bias),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<float>(output),
|
||||
tflite::micro::GetTensorShape(im2col),
|
||||
tflite::micro::GetTensorData<float>(im2col));
|
||||
}
|
||||
|
||||
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
|
||||
auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
|
||||
|
||||
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
|
||||
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
|
||||
const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
|
||||
const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
|
||||
const TfLiteEvalTensor* input =
|
||||
tflite::micro::GetEvalInput(context, node, kInputTensor);
|
||||
const TfLiteEvalTensor* filter =
|
||||
tflite::micro::GetEvalInput(context, node, kFilterTensor);
|
||||
const TfLiteEvalTensor* bias =
|
||||
(NumInputs(node) == 3)
|
||||
? tflite::micro::GetEvalInput(context, node, kBiasTensor)
|
||||
: nullptr;
|
||||
TfLiteEvalTensor* output =
|
||||
tflite::micro::GetEvalOutput(context, node, kOutputTensor);
|
||||
|
||||
TFLITE_DCHECK(node->user_data != nullptr);
|
||||
const OpData& data = *(static_cast<const OpData*>(node->user_data));
|
||||
@ -291,16 +318,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
|
||||
|
||||
} // namespace conv
|
||||
|
||||
TfLiteRegistration* Register_CONV_2D() {
|
||||
static TfLiteRegistration r = {/*init=*/conv::Init,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/conv::Prepare,
|
||||
/*invoke=*/conv::Eval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
TfLiteRegistration Register_CONV_2D() {
|
||||
return {/*init=*/conv::Init,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/conv::Prepare,
|
||||
/*invoke=*/conv::Eval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
} // namespace micro
|
||||
|
||||
@ -24,6 +24,7 @@ limitations under the License.
|
||||
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
|
||||
#include "tensorflow/lite/kernels/kernel_util.h"
|
||||
#include "tensorflow/lite/kernels/padding.h"
|
||||
#include "tensorflow/lite/micro/kernels/kernel_util.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace ops {
|
||||
@ -42,6 +43,12 @@ constexpr int kDepthwiseConvQuantizedDimension = 3;
|
||||
|
||||
struct OpData {
|
||||
TfLitePaddingValues padding;
|
||||
|
||||
// Cached tensor zero point values for quantized operations.
|
||||
int32_t input_zero_point;
|
||||
int32_t filter_zero_point;
|
||||
int32_t output_zero_point;
|
||||
|
||||
// The scaling factor from input to output (aka the 'real multiplier') can
|
||||
// be represented as a fixed point multiplier plus a left shift.
|
||||
int32_t output_multiplier;
|
||||
@ -95,12 +102,7 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
|
||||
|
||||
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
|
||||
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
|
||||
void* data = nullptr;
|
||||
if (context->AllocatePersistentBuffer(context, sizeof(OpData), &data) ==
|
||||
kTfLiteError) {
|
||||
return nullptr;
|
||||
}
|
||||
return data;
|
||||
return context->AllocatePersistentBuffer(context, sizeof(OpData));
|
||||
}
|
||||
|
||||
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
|
||||
@ -111,6 +113,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
|
||||
reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
|
||||
OpData* data = static_cast<OpData*>(node->user_data);
|
||||
|
||||
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
|
||||
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
|
||||
const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
|
||||
|
||||
@ -120,16 +123,16 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
|
||||
int filter_width = SizeOfDimension(filter, 2);
|
||||
int filter_height = SizeOfDimension(filter, 1);
|
||||
|
||||
// Per channel quantization is only needed for int8 inference. For other
|
||||
// Per channel quantization is only needed for int8_t inference. For other
|
||||
// quantized types, only a single scale and zero point is needed.
|
||||
const int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension];
|
||||
// Dynimically allocate per-channel quantization parameters.
|
||||
TF_LITE_ENSURE_STATUS(context->AllocatePersistentBuffer(
|
||||
context, num_channels * sizeof(int32_t),
|
||||
reinterpret_cast<void**>(&data->per_channel_output_multiplier)));
|
||||
TF_LITE_ENSURE_STATUS(context->AllocatePersistentBuffer(
|
||||
context, num_channels * sizeof(int32_t),
|
||||
reinterpret_cast<void**>(&data->per_channel_output_shift)));
|
||||
data->per_channel_output_multiplier =
|
||||
reinterpret_cast<int32_t*>(context->AllocatePersistentBuffer(
|
||||
context, num_channels * sizeof(int32_t)));
|
||||
data->per_channel_output_shift =
|
||||
reinterpret_cast<int32_t*>(context->AllocatePersistentBuffer(
|
||||
context, num_channels * sizeof(int32_t)));
|
||||
|
||||
// All per-channel quantized tensors need valid zero point and scale arrays.
|
||||
if (input->type == kTfLiteInt8) {
|
||||
@ -150,14 +153,21 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
|
||||
affine_quantization->zero_point->size);
|
||||
}
|
||||
|
||||
return CalculateOpData(context, node, params, width, height, filter_width,
|
||||
filter_height, data_type, data);
|
||||
TF_LITE_ENSURE_STATUS(CalculateOpData(context, node, params, width, height,
|
||||
filter_width, filter_height, data_type,
|
||||
data));
|
||||
|
||||
data->input_zero_point = input->params.zero_point;
|
||||
data->filter_zero_point = filter->params.zero_point;
|
||||
data->output_zero_point = output->params.zero_point;
|
||||
|
||||
return kTfLiteOk;
|
||||
}
|
||||
|
||||
void EvalFloat(TfLiteContext* context, TfLiteNode* node,
|
||||
TfLiteDepthwiseConvParams* params, const OpData* data,
|
||||
const TfLiteTensor* input, const TfLiteTensor* filter,
|
||||
const TfLiteTensor* bias, TfLiteTensor* output) {
|
||||
TfLiteDepthwiseConvParams* params, const OpData& data,
|
||||
const TfLiteEvalTensor* input, const TfLiteEvalTensor* filter,
|
||||
const TfLiteEvalTensor* bias, TfLiteEvalTensor* output) {
|
||||
float output_activation_min, output_activation_max;
|
||||
CalculateActivationRange(params->activation, &output_activation_min,
|
||||
&output_activation_max);
|
||||
@ -165,8 +175,8 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node,
|
||||
tflite::DepthwiseParams op_params;
|
||||
// Padding type is ignored, but still set.
|
||||
op_params.padding_type = PaddingType::kSame;
|
||||
op_params.padding_values.width = data->padding.width;
|
||||
op_params.padding_values.height = data->padding.height;
|
||||
op_params.padding_values.width = data.padding.width;
|
||||
op_params.padding_values.height = data.padding.height;
|
||||
op_params.stride_width = params->stride_width;
|
||||
op_params.stride_height = params->stride_height;
|
||||
op_params.dilation_width_factor = params->dilation_width_factor;
|
||||
@ -176,74 +186,87 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node,
|
||||
op_params.float_activation_max = output_activation_max;
|
||||
|
||||
tflite::reference_ops::DepthwiseConv(
|
||||
op_params, GetTensorShape(input), GetTensorData<float>(input),
|
||||
GetTensorShape(filter), GetTensorData<float>(filter),
|
||||
GetTensorShape(bias), GetTensorData<float>(bias), GetTensorShape(output),
|
||||
GetTensorData<float>(output));
|
||||
op_params, tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<float>(input),
|
||||
tflite::micro::GetTensorShape(filter),
|
||||
tflite::micro::GetTensorData<float>(filter),
|
||||
tflite::micro::GetTensorShape(bias),
|
||||
tflite::micro::GetTensorData<float>(bias),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<float>(output));
|
||||
}
|
||||
|
||||
void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
|
||||
TfLiteDepthwiseConvParams* params,
|
||||
const OpData* data, const TfLiteTensor* input,
|
||||
const TfLiteTensor* filter,
|
||||
const TfLiteTensor* bias, TfLiteTensor* output) {
|
||||
const OpData& data, const TfLiteEvalTensor* input,
|
||||
const TfLiteEvalTensor* filter,
|
||||
const TfLiteEvalTensor* bias,
|
||||
TfLiteEvalTensor* output) {
|
||||
DepthwiseParams op_params;
|
||||
op_params.padding_type = PaddingType::kSame;
|
||||
op_params.padding_values.width = data->padding.width;
|
||||
op_params.padding_values.height = data->padding.height;
|
||||
op_params.padding_values.width = data.padding.width;
|
||||
op_params.padding_values.height = data.padding.height;
|
||||
op_params.stride_width = params->stride_width;
|
||||
op_params.stride_height = params->stride_height;
|
||||
op_params.dilation_width_factor = params->dilation_width_factor;
|
||||
op_params.dilation_height_factor = params->dilation_height_factor;
|
||||
op_params.depth_multiplier = params->depth_multiplier;
|
||||
op_params.input_offset = -input->params.zero_point;
|
||||
op_params.input_offset = -data.input_zero_point;
|
||||
op_params.weights_offset = 0;
|
||||
op_params.output_offset = output->params.zero_point;
|
||||
op_params.output_offset = data.output_zero_point;
|
||||
// TODO(b/130439627): Use calculated value for clamping.
|
||||
op_params.quantized_activation_min = std::numeric_limits<int8_t>::min();
|
||||
op_params.quantized_activation_max = std::numeric_limits<int8_t>::max();
|
||||
|
||||
reference_integer_ops::DepthwiseConvPerChannel(
|
||||
op_params, data->per_channel_output_multiplier,
|
||||
data->per_channel_output_shift, GetTensorShape(input),
|
||||
GetTensorData<int8>(input), GetTensorShape(filter),
|
||||
GetTensorData<int8>(filter), GetTensorShape(bias),
|
||||
GetTensorData<int32>(bias), GetTensorShape(output),
|
||||
GetTensorData<int8>(output));
|
||||
op_params, data.per_channel_output_multiplier,
|
||||
data.per_channel_output_shift, tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<int8_t>(input),
|
||||
tflite::micro::GetTensorShape(filter),
|
||||
tflite::micro::GetTensorData<int8_t>(filter),
|
||||
tflite::micro::GetTensorShape(bias),
|
||||
tflite::micro::GetTensorData<int32_t>(bias),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<int8_t>(output));
|
||||
}
|
||||
|
||||
void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
|
||||
TfLiteDepthwiseConvParams* params, const OpData* data,
|
||||
const TfLiteTensor* input, const TfLiteTensor* filter,
|
||||
const TfLiteTensor* bias, TfLiteTensor* output) {
|
||||
const int32_t input_offset = -input->params.zero_point;
|
||||
const int32_t filter_offset = -filter->params.zero_point;
|
||||
const int32_t output_offset = output->params.zero_point;
|
||||
TfLiteDepthwiseConvParams* params, const OpData& data,
|
||||
const TfLiteEvalTensor* input,
|
||||
const TfLiteEvalTensor* filter, const TfLiteEvalTensor* bias,
|
||||
TfLiteEvalTensor* output) {
|
||||
const int32_t input_offset = -data.input_zero_point;
|
||||
const int32_t filter_offset = -data.filter_zero_point;
|
||||
const int32_t output_offset = data.output_zero_point;
|
||||
|
||||
tflite::DepthwiseParams op_params;
|
||||
// Padding type is ignored, but still set.
|
||||
op_params.padding_type = PaddingType::kSame;
|
||||
op_params.padding_values.width = data->padding.width;
|
||||
op_params.padding_values.height = data->padding.height;
|
||||
op_params.padding_values.width = data.padding.width;
|
||||
op_params.padding_values.height = data.padding.height;
|
||||
op_params.stride_width = params->stride_width;
|
||||
op_params.stride_height = params->stride_height;
|
||||
op_params.dilation_width_factor = params->dilation_width_factor;
|
||||
op_params.dilation_height_factor = params->dilation_height_factor;
|
||||
op_params.depth_multiplier = params->depth_multiplier;
|
||||
op_params.quantized_activation_min = data->output_activation_min;
|
||||
op_params.quantized_activation_max = data->output_activation_max;
|
||||
op_params.quantized_activation_min = data.output_activation_min;
|
||||
op_params.quantized_activation_max = data.output_activation_max;
|
||||
op_params.input_offset = input_offset;
|
||||
op_params.weights_offset = filter_offset;
|
||||
op_params.output_offset = output_offset;
|
||||
op_params.output_multiplier = data->output_multiplier;
|
||||
op_params.output_multiplier = data.output_multiplier;
|
||||
// Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
|
||||
op_params.output_shift = -data->output_shift;
|
||||
op_params.output_shift = -data.output_shift;
|
||||
|
||||
tflite::reference_ops::DepthwiseConv(
|
||||
op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
|
||||
GetTensorShape(filter), GetTensorData<uint8_t>(filter),
|
||||
GetTensorShape(bias), GetTensorData<int32_t>(bias),
|
||||
GetTensorShape(output), GetTensorData<uint8_t>(output));
|
||||
op_params, tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<uint8_t>(input),
|
||||
tflite::micro::GetTensorShape(filter),
|
||||
tflite::micro::GetTensorData<uint8_t>(filter),
|
||||
tflite::micro::GetTensorShape(bias),
|
||||
tflite::micro::GetTensorData<int32_t>(bias),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<uint8_t>(output));
|
||||
}
|
||||
|
||||
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
@ -254,24 +277,29 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
      reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
  const OpData& data = *(static_cast<const OpData*>(node->user_data));

  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
  const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
  const TfLiteTensor* bias =
      (NumInputs(node) == 3) ? GetInput(context, node, kBiasTensor) : nullptr;
  TfLiteEvalTensor* output =
      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
  const TfLiteEvalTensor* input =
      tflite::micro::GetEvalInput(context, node, kInputTensor);
  const TfLiteEvalTensor* filter =
      tflite::micro::GetEvalInput(context, node, kFilterTensor);
  const TfLiteEvalTensor* bias =
      (NumInputs(node) == 3)
          ? tflite::micro::GetEvalInput(context, node, kBiasTensor)
          : nullptr;

  // TODO(aselle): Consider whether float conv and quantized conv should be
  // separate ops to avoid dispatch overhead here.
  switch (input->type) { // Already know in/out types are same.
    case kTfLiteFloat32:
      EvalFloat(context, node, params, &data, input, filter, bias, output);
      EvalFloat(context, node, params, data, input, filter, bias, output);
      break;
    case kTfLiteInt8:
      EvalQuantizedPerChannel(context, node, params, &data, input, filter, bias,
      EvalQuantizedPerChannel(context, node, params, data, input, filter, bias,
                              output);
      break;
    case kTfLiteUInt8:
      EvalQuantized(context, node, params, &data, input, filter, bias, output);
      EvalQuantized(context, node, params, data, input, filter, bias, output);
      break;
    default:
      TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
@ -283,16 +311,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {

} // namespace depthwise_conv
|
||||
TfLiteRegistration* Register_DEPTHWISE_CONV_2D() {
|
||||
static TfLiteRegistration r = {/*init=*/depthwise_conv::Init,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/depthwise_conv::Prepare,
|
||||
/*invoke=*/depthwise_conv::Eval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
TfLiteRegistration Register_DEPTHWISE_CONV_2D() {
|
||||
return {/*init=*/depthwise_conv::Init,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/depthwise_conv::Prepare,
|
||||
/*invoke=*/depthwise_conv::Eval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
} // namespace micro
|
||||
|
||||
@ -22,6 +22,7 @@ limitations under the License.
|
||||
#include "tensorflow/lite/kernels/internal/reference/requantize.h"
|
||||
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
|
||||
#include "tensorflow/lite/kernels/kernel_util.h"
|
||||
#include "tensorflow/lite/micro/kernels/kernel_util.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace ops {
|
||||
@ -29,20 +30,17 @@ namespace micro {
namespace dequantize {

struct OpData {
  tflite::DequantizationParams quantization_params;
  // The scaling factor from input to output (aka the 'real multiplier') can
  // be represented as a fixed point multiplier plus a left shift.
  int32_t output_multiplier;
  int output_shift;
  int32_t output_zero_point;
};

void* Init(TfLiteContext* context, const char* buffer, size_t length) {
  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
  void* data = nullptr;
  if (context->AllocatePersistentBuffer(context, sizeof(OpData), &data) ==
      kTfLiteError) {
    return nullptr;
  }
  return data;
  return context->AllocatePersistentBuffer(context, sizeof(OpData));
}

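The added output_multiplier, output_shift, and output_zero_point fields feed the int32 requantize path handled further down in Eval. As a rough scalar illustration of what that path computes (floating-point here for clarity; the kernel itself uses the cached fixed-point multiplier and shift):

#include <cmath>
#include <cstdint>

// Conceptual sketch only: map one quantized input value onto a new
// scale/zero-point pair, which is what the requantize branch does per element.
int32_t RequantizeOne(int8_t in, float in_scale, int32_t in_zero_point,
                      float out_scale, int32_t out_zero_point) {
  const float real_value = (static_cast<int32_t>(in) - in_zero_point) * in_scale;
  return static_cast<int32_t>(std::lround(real_value / out_scale)) + out_zero_point;
}
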
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
|
||||
@ -69,6 +67,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
|
||||
QuantizeMultiplier(effective_output_scale, &data->output_multiplier,
|
||||
&data->output_shift);
|
||||
}
|
||||
|
||||
data->quantization_params.zero_point = input->params.zero_point;
|
||||
data->quantization_params.scale = static_cast<double>(input->params.scale);
|
||||
data->output_zero_point = output->params.zero_point;
|
||||
return kTfLiteOk;
|
||||
}
|
||||
|
||||
@ -76,28 +78,31 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
|
||||
TFLITE_DCHECK(node->user_data != nullptr);
|
||||
OpData* data = static_cast<OpData*>(node->user_data);
|
||||
|
||||
const TfLiteTensor* input = GetInput(context, node, 0);
|
||||
TfLiteTensor* output = GetOutput(context, node, 0);
|
||||
const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0);
|
||||
TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0);
|
||||
|
||||
if (output->type == kTfLiteFloat32) {
|
||||
tflite::DequantizationParams op_params;
|
||||
op_params.zero_point = input->params.zero_point;
|
||||
op_params.scale = static_cast<double>(input->params.scale);
|
||||
switch (input->type) {
|
||||
case kTfLiteUInt8:
|
||||
reference_ops::Dequantize(
|
||||
op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
|
||||
GetTensorShape(output), GetTensorData<float>(output));
|
||||
reference_ops::Dequantize(data->quantization_params,
|
||||
tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<uint8_t>(input),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<float>(output));
|
||||
break;
|
||||
case kTfLiteInt8:
|
||||
reference_ops::Dequantize(
|
||||
op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
|
||||
GetTensorShape(output), GetTensorData<float>(output));
|
||||
reference_ops::Dequantize(data->quantization_params,
|
||||
tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<int8_t>(input),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<float>(output));
|
||||
break;
|
||||
case kTfLiteInt16:
|
||||
reference_ops::Dequantize(
|
||||
op_params, GetTensorShape(input), GetTensorData<int16_t>(input),
|
||||
GetTensorShape(output), GetTensorData<float>(output));
|
||||
reference_ops::Dequantize(data->quantization_params,
|
||||
tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<int16_t>(input),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<float>(output));
|
||||
break;
|
||||
default:
|
||||
TF_LITE_KERNEL_LOG(context, "Input %s, output %s not supported.",
|
||||
@ -106,21 +111,23 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
|
||||
return kTfLiteError;
|
||||
}
|
||||
} else if (output->type == kTfLiteInt32) {
|
||||
int flat_size =
|
||||
MatchingFlatSize(GetTensorShape(input), GetTensorShape(output));
|
||||
int flat_size = MatchingFlatSize(tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorShape(output));
|
||||
switch (input->type) {
|
||||
case kTfLiteInt16: {
|
||||
reference_ops::Requantize(
|
||||
GetTensorData<int16_t>(input), flat_size, data->output_multiplier,
|
||||
data->output_shift, input->params.zero_point,
|
||||
output->params.zero_point, GetTensorData<int32_t>(output));
|
||||
tflite::micro::GetTensorData<int16_t>(input), flat_size,
|
||||
data->output_multiplier, data->output_shift,
|
||||
data->quantization_params.zero_point, data->output_zero_point,
|
||||
tflite::micro::GetTensorData<int32_t>(output));
|
||||
break;
|
||||
}
|
||||
case kTfLiteInt8: {
|
||||
reference_ops::Requantize(
|
||||
GetTensorData<int8_t>(input), flat_size, data->output_multiplier,
|
||||
data->output_shift, input->params.zero_point,
|
||||
output->params.zero_point, GetTensorData<int32_t>(output));
|
||||
tflite::micro::GetTensorData<int8_t>(input), flat_size,
|
||||
data->output_multiplier, data->output_shift,
|
||||
data->quantization_params.zero_point, data->output_zero_point,
|
||||
tflite::micro::GetTensorData<int32_t>(output));
|
||||
break;
|
||||
}
|
||||
default:
|
||||
@ -141,16 +148,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
|
||||
|
||||
} // namespace dequantize
|
||||
|
||||
TfLiteRegistration* Register_DEQUANTIZE() {
|
||||
static TfLiteRegistration r = {/*init=*/dequantize::Init,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/dequantize::Prepare,
|
||||
/*invoke=*/dequantize::Eval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
TfLiteRegistration Register_DEQUANTIZE() {
|
||||
return {/*init=*/dequantize::Init,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/dequantize::Prepare,
|
||||
/*invoke=*/dequantize::Eval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
} // namespace micro
|
||||
|
||||
@ -18,6 +18,8 @@ limitations under the License.
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"
#include "tensorflow/lite/micro/micro_utils.h"

namespace tflite {
namespace ops {
@ -40,7 +42,7 @@ TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
const TfLiteTensor* input = GetInput(context, node, 0);
TfLiteTensor* output = GetOutput(context, node, 0);
TF_LITE_ENSURE_EQ(context, input->type, output->type);
TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
if (!IsSupportedType(input->type)) {
TF_LITE_KERNEL_LOG(context, "Input data type %s (%d) is not supported.",
TfLiteTypeGetName(input->type), input->type);
@ -52,13 +54,13 @@ TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) {
template <typename T>
inline TfLiteStatus EvalImpl(TfLiteContext* context, TfLiteNode* node,
T func(T), TfLiteType expected_type) {
const TfLiteTensor* input = GetInput(context, node, 0);
TfLiteTensor* output = GetOutput(context, node, 0);
TF_LITE_ENSURE_EQ(context, input->type, expected_type);
const int64_t num_elements = NumElements(input);
const T* in_data = GetTensorData<T>(input);
T* out_data = GetTensorData<T>(output);
for (int64_t i = 0; i < num_elements; ++i) {
const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0);
TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0);
TF_LITE_ENSURE_TYPES_EQ(context, input->type, expected_type);
const size_t num_elements = ElementCount(*input->dims);
const T* in_data = tflite::micro::GetTensorData<T>(input);
T* out_data = tflite::micro::GetTensorData<T>(output);
for (size_t i = 0; i < num_elements; ++i) {
out_data[i] = func(in_data[i]);
}
return kTfLiteOk;
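The EvalImpl rewrite above shows the Eval-time pattern the commit moves to: kernels read TfLiteEvalTensor objects through tflite::micro::GetEvalInput/GetEvalOutput instead of full TfLiteTensor objects. A rough self-contained sketch of a float element-wise kernel written against that API; the op itself and its absolute-value body are invented for illustration, while ElementCount comes from micro_utils.h as used in the hunk:

#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"
#include "tensorflow/lite/micro/micro_utils.h"

namespace tflite {
namespace ops {
namespace micro {
namespace example_abs {

// Eval-time tensors only expose type, dims and data, which is all an
// element-wise float kernel needs.
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
  const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0);
  TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0);
  TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32);

  const size_t num_elements = ElementCount(*input->dims);
  const float* in = tflite::micro::GetTensorData<float>(input);
  float* out = tflite::micro::GetTensorData<float>(output);
  for (size_t i = 0; i < num_elements; ++i) {
    out[i] = in[i] < 0.f ? -in[i] : in[i];  // |x|, purely illustrative
  }
  return kTfLiteOk;
}

}  // namespace example_abs
}  // namespace micro
}  // namespace ops
}  // namespace tflite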
@ -106,137 +108,103 @@ TfLiteStatus LogicalNotEval(TfLiteContext* context, TfLiteNode* node) {
|
||||
return EvalLogical(context, node, [](bool v) { return !v; });
|
||||
}
|
||||
|
||||
TfLiteStatus TANHEval(TfLiteContext* context, TfLiteNode* node) {
|
||||
return EvalNumeric(context, node, std::tanh);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace elementwise
|
||||
|
||||
TfLiteRegistration* Register_ABS() {
|
||||
static TfLiteRegistration r = {
|
||||
/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/
|
||||
elementwise::GenericPrepare<elementwise::IsNumericSupportedType>,
|
||||
/*invoke=*/elementwise::AbsEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
TfLiteRegistration Register_ABS() {
|
||||
return {/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/
|
||||
elementwise::GenericPrepare<elementwise::IsNumericSupportedType>,
|
||||
/*invoke=*/elementwise::AbsEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
TfLiteRegistration* Register_SIN() {
|
||||
static TfLiteRegistration r = {
|
||||
/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/
|
||||
elementwise::GenericPrepare<elementwise::IsNumericSupportedType>,
|
||||
/*invoke=*/elementwise::SinEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
TfLiteRegistration Register_SIN() {
|
||||
return {/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/
|
||||
elementwise::GenericPrepare<elementwise::IsNumericSupportedType>,
|
||||
/*invoke=*/elementwise::SinEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
TfLiteRegistration* Register_COS() {
|
||||
static TfLiteRegistration r = {
|
||||
/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/
|
||||
elementwise::GenericPrepare<elementwise::IsNumericSupportedType>,
|
||||
/*invoke=*/elementwise::CosEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
TfLiteRegistration Register_COS() {
|
||||
return {/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/
|
||||
elementwise::GenericPrepare<elementwise::IsNumericSupportedType>,
|
||||
/*invoke=*/elementwise::CosEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
TfLiteRegistration* Register_LOG() {
|
||||
static TfLiteRegistration r = {
|
||||
/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/
|
||||
elementwise::GenericPrepare<elementwise::IsNumericSupportedType>,
|
||||
/*invoke=*/elementwise::LogEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
TfLiteRegistration Register_LOG() {
|
||||
return {/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/
|
||||
elementwise::GenericPrepare<elementwise::IsNumericSupportedType>,
|
||||
/*invoke=*/elementwise::LogEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
TfLiteRegistration* Register_SQRT() {
|
||||
static TfLiteRegistration r = {
|
||||
/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/
|
||||
elementwise::GenericPrepare<elementwise::IsNumericSupportedType>,
|
||||
/*invoke=*/elementwise::SqrtEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
TfLiteRegistration Register_SQRT() {
|
||||
return {/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/
|
||||
elementwise::GenericPrepare<elementwise::IsNumericSupportedType>,
|
||||
/*invoke=*/elementwise::SqrtEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
TfLiteRegistration* Register_RSQRT() {
|
||||
static TfLiteRegistration r = {
|
||||
/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/
|
||||
elementwise::GenericPrepare<elementwise::IsNumericSupportedType>,
|
||||
/*invoke=*/elementwise::RsqrtEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
TfLiteRegistration Register_RSQRT() {
|
||||
return {/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/
|
||||
elementwise::GenericPrepare<elementwise::IsNumericSupportedType>,
|
||||
/*invoke=*/elementwise::RsqrtEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
TfLiteRegistration* Register_SQUARE() {
|
||||
static TfLiteRegistration r = {
|
||||
/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/
|
||||
elementwise::GenericPrepare<elementwise::IsNumericSupportedType>,
|
||||
/*invoke=*/elementwise::SquareEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
TfLiteRegistration Register_SQUARE() {
|
||||
return {/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/
|
||||
elementwise::GenericPrepare<elementwise::IsNumericSupportedType>,
|
||||
/*invoke=*/elementwise::SquareEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
TfLiteRegistration* Register_LOGICAL_NOT() {
|
||||
static TfLiteRegistration r = {
|
||||
/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/
|
||||
elementwise::GenericPrepare<elementwise::IsLogicalSupportedType>,
|
||||
/*invoke=*/elementwise::LogicalNotEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
}
|
||||
|
||||
TfLiteRegistration* Register_TANH() {
|
||||
static TfLiteRegistration r = {
|
||||
/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/
|
||||
elementwise::GenericPrepare<elementwise::IsNumericSupportedType>,
|
||||
/*invoke=*/elementwise::TANHEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
TfLiteRegistration Register_LOGICAL_NOT() {
|
||||
return {/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/
|
||||
elementwise::GenericPrepare<elementwise::IsLogicalSupportedType>,
|
||||
/*invoke=*/elementwise::LogicalNotEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
} // namespace micro
|
||||
|
||||
@ -17,7 +17,7 @@ limitations under the License.
|
||||
|
||||
#include "tensorflow/lite/c/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
|
||||
#include "tensorflow/lite/kernels/kernel_util.h"
|
||||
#include "tensorflow/lite/micro/kernels/kernel_util.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace ops {
|
||||
@ -28,25 +28,28 @@ constexpr int kInputTensor = 0;
|
||||
constexpr int kOutputTensor = 0;
|
||||
|
||||
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
|
||||
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
|
||||
TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
|
||||
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
|
||||
reference_ops::Floor(GetTensorShape(input), GetTensorData<float>(input),
|
||||
GetTensorShape(output), GetTensorData<float>(output));
|
||||
const TfLiteEvalTensor* input =
|
||||
tflite::micro::GetEvalInput(context, node, kInputTensor);
|
||||
TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32);
|
||||
TfLiteEvalTensor* output =
|
||||
tflite::micro::GetEvalOutput(context, node, kOutputTensor);
|
||||
reference_ops::Floor(tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<float>(input),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<float>(output));
|
||||
return kTfLiteOk;
|
||||
}
|
||||
} // namespace floor
|
||||
|
||||
TfLiteRegistration* Register_FLOOR() {
|
||||
static TfLiteRegistration r = {/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/nullptr,
|
||||
/*invoke=*/floor::Eval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
TfLiteRegistration Register_FLOOR() {
|
||||
return {/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/nullptr,
|
||||
/*invoke=*/floor::Eval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
} // namespace micro
|
||||
|
||||
@ -22,6 +22,7 @@ limitations under the License.
|
||||
#include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h"
|
||||
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
|
||||
#include "tensorflow/lite/kernels/kernel_util.h"
|
||||
#include "tensorflow/lite/micro/kernels/kernel_util.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace ops {
|
||||
@ -40,6 +41,10 @@ struct OpData {
|
||||
int32_t output_activation_max;
|
||||
// The index of the temporary tensor where the quantized inputs are cached.
|
||||
int input_quantized_index;
|
||||
// Cached zero point values of tensors.
|
||||
int32_t input_zero_point;
|
||||
int32_t filter_zero_point;
|
||||
int32_t output_zero_point;
|
||||
};
|
||||
|
||||
constexpr int kInputTensor = 0;
|
||||
@ -64,6 +69,10 @@ TfLiteStatus CalculateOpData(TfLiteContext* context,
|
||||
TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
|
||||
context, activation, output, &data->output_activation_min,
|
||||
&data->output_activation_max));
|
||||
|
||||
data->input_zero_point = input->params.zero_point;
|
||||
data->filter_zero_point = filter->params.zero_point;
|
||||
data->output_zero_point = output->params.zero_point;
|
||||
}
|
||||
return status;
|
||||
}
|
||||
@ -72,12 +81,7 @@ TfLiteStatus CalculateOpData(TfLiteContext* context,

void* Init(TfLiteContext* context, const char* buffer, size_t length) {
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
void* data = nullptr;
if (context->AllocatePersistentBuffer(context, sizeof(OpData), &data) ==
kTfLiteError) {
return nullptr;
}
return data;
return context->AllocatePersistentBuffer(context, sizeof(OpData));
}

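Several Init hooks in this diff shrink the same way: TfLiteContext::AllocatePersistentBuffer now hands back the buffer pointer directly instead of reporting success through an out-parameter. A small sketch against the newer signature; MyOpData is a placeholder struct, not something from this commit:

#include <cstdint>

#include "tensorflow/lite/c/common.h"

namespace {

// Placeholder per-node scratch data; real kernels cache quantization
// parameters and activation ranges here.
struct MyOpData {
  int32_t output_multiplier;
  int output_shift;
};

// Allocations made here live in the interpreter's arena for its whole
// lifetime and surface to Prepare/Eval as node->user_data.
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
  return context->AllocatePersistentBuffer(context, sizeof(MyOpData));
}

}  // namespace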
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
|
||||
@ -93,7 +97,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
|
||||
const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
|
||||
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
|
||||
|
||||
TF_LITE_ENSURE_EQ(context, input->type, output->type);
|
||||
TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
|
||||
TF_LITE_ENSURE_MSG(context, input->type == filter->type,
|
||||
"Hybrid models are not supported on TFLite Micro.");
|
||||
|
||||
@ -102,13 +106,15 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
|
||||
}
|
||||
|
||||
TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
|
||||
const OpData& data, const TfLiteTensor* input,
|
||||
const TfLiteTensor* filter,
|
||||
const TfLiteTensor* bias, TfLiteTensor* output) {
|
||||
const OpData& data,
|
||||
const TfLiteEvalTensor* input,
|
||||
const TfLiteEvalTensor* filter,
|
||||
const TfLiteEvalTensor* bias,
|
||||
TfLiteEvalTensor* output) {
|
||||
tflite::FullyConnectedParams op_params;
|
||||
op_params.input_offset = -input->params.zero_point;
|
||||
op_params.weights_offset = -filter->params.zero_point;
|
||||
op_params.output_offset = output->params.zero_point;
|
||||
op_params.input_offset = -data.input_zero_point;
|
||||
op_params.weights_offset = -data.filter_zero_point;
|
||||
op_params.output_offset = data.output_zero_point;
|
||||
op_params.output_multiplier = data.output_multiplier;
|
||||
// TODO(b/138810107): Figure out whether output shift should be inverted
|
||||
op_params.output_shift = -data.output_shift;
|
||||
@ -116,20 +122,25 @@ TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
|
||||
op_params.quantized_activation_max = data.output_activation_max;
|
||||
|
||||
reference_integer_ops::FullyConnected(
|
||||
op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
|
||||
GetTensorShape(filter), GetTensorData<int8_t>(filter),
|
||||
GetTensorShape(bias), GetTensorData<int32_t>(bias),
|
||||
GetTensorShape(output), GetTensorData<int8_t>(output));
|
||||
op_params, tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<int8_t>(input),
|
||||
tflite::micro::GetTensorShape(filter),
|
||||
tflite::micro::GetTensorData<int8_t>(filter),
|
||||
tflite::micro::GetTensorShape(bias),
|
||||
tflite::micro::GetTensorData<int32_t>(bias),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<int8_t>(output));
|
||||
return kTfLiteOk;
|
||||
}
|
||||
|
||||
TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
|
||||
const OpData& data, const TfLiteTensor* input,
|
||||
const TfLiteTensor* filter, const TfLiteTensor* bias,
|
||||
TfLiteTensor* output) {
|
||||
const int32_t input_offset = -input->params.zero_point;
|
||||
const int32_t filter_offset = -filter->params.zero_point;
|
||||
const int32_t output_offset = output->params.zero_point;
|
||||
const OpData& data, const TfLiteEvalTensor* input,
|
||||
const TfLiteEvalTensor* filter,
|
||||
const TfLiteEvalTensor* bias,
|
||||
TfLiteEvalTensor* output) {
|
||||
const int32_t input_offset = -data.input_zero_point;
|
||||
const int32_t filter_offset = -data.filter_zero_point;
|
||||
const int32_t output_offset = data.output_zero_point;
|
||||
|
||||
tflite::FullyConnectedParams op_params;
|
||||
op_params.input_offset = input_offset;
|
||||
@ -141,12 +152,16 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
|
||||
op_params.quantized_activation_min = data.output_activation_min;
|
||||
op_params.quantized_activation_max = data.output_activation_max;
|
||||
|
||||
#define TF_LITE_FULLY_CONNECTED(output_data_type) \
|
||||
reference_ops::FullyConnected( \
|
||||
op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), \
|
||||
GetTensorShape(filter), GetTensorData<uint8_t>(filter), \
|
||||
GetTensorShape(bias), GetTensorData<int32_t>(bias), \
|
||||
GetTensorShape(output), GetTensorData<output_data_type>(output))
|
||||
#define TF_LITE_FULLY_CONNECTED(output_data_type) \
|
||||
reference_ops::FullyConnected( \
|
||||
op_params, tflite::micro::GetTensorShape(input), \
|
||||
tflite::micro::GetTensorData<uint8_t>(input), \
|
||||
tflite::micro::GetTensorShape(filter), \
|
||||
tflite::micro::GetTensorData<uint8_t>(filter), \
|
||||
tflite::micro::GetTensorShape(bias), \
|
||||
tflite::micro::GetTensorData<int32_t>(bias), \
|
||||
tflite::micro::GetTensorShape(output), \
|
||||
tflite::micro::GetTensorData<output_data_type>(output))
|
||||
switch (output->type) {
|
||||
case kTfLiteUInt8:
|
||||
TF_LITE_FULLY_CONNECTED(uint8_t);
|
||||
@ -165,8 +180,9 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
|
||||
|
||||
TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
|
||||
TfLiteFusedActivation activation,
|
||||
const TfLiteTensor* input, const TfLiteTensor* filter,
|
||||
const TfLiteTensor* bias, TfLiteTensor* output) {
|
||||
const TfLiteEvalTensor* input,
|
||||
const TfLiteEvalTensor* filter,
|
||||
const TfLiteEvalTensor* bias, TfLiteEvalTensor* output) {
|
||||
float output_activation_min, output_activation_max;
|
||||
CalculateActivationRange(activation, &output_activation_min,
|
||||
&output_activation_max);
|
||||
@ -174,10 +190,14 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
|
||||
op_params.float_activation_min = output_activation_min;
|
||||
op_params.float_activation_max = output_activation_max;
|
||||
tflite::reference_ops::FullyConnected(
|
||||
op_params, GetTensorShape(input), GetTensorData<float>(input),
|
||||
GetTensorShape(filter), GetTensorData<float>(filter),
|
||||
GetTensorShape(bias), GetTensorData<float>(bias), GetTensorShape(output),
|
||||
GetTensorData<float>(output));
|
||||
op_params, tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<float>(input),
|
||||
tflite::micro::GetTensorShape(filter),
|
||||
tflite::micro::GetTensorData<float>(filter),
|
||||
tflite::micro::GetTensorShape(bias),
|
||||
tflite::micro::GetTensorData<float>(bias),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<float>(output));
|
||||
return kTfLiteOk;
|
||||
}
|
||||
|
||||
@ -186,10 +206,14 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
|
||||
const auto* params =
|
||||
static_cast<const TfLiteFullyConnectedParams*>(node->builtin_data);
|
||||
|
||||
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
|
||||
const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
|
||||
const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
|
||||
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
|
||||
const TfLiteEvalTensor* input =
|
||||
tflite::micro::GetEvalInput(context, node, kInputTensor);
|
||||
const TfLiteEvalTensor* filter =
|
||||
tflite::micro::GetEvalInput(context, node, kWeightsTensor);
|
||||
const TfLiteEvalTensor* bias =
|
||||
tflite::micro::GetEvalInput(context, node, kBiasTensor);
|
||||
TfLiteEvalTensor* output =
|
||||
tflite::micro::GetEvalOutput(context, node, kOutputTensor);
|
||||
|
||||
TFLITE_DCHECK(node->user_data != nullptr);
|
||||
const OpData& data = *(static_cast<const OpData*>(node->user_data));
|
||||
@ -216,16 +240,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
|
||||
|
||||
} // namespace fully_connected
|
||||
|
||||
TfLiteRegistration* Register_FULLY_CONNECTED() {
|
||||
static TfLiteRegistration r = {/*init=*/fully_connected::Init,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/fully_connected::Prepare,
|
||||
/*invoke=*/fully_connected::Eval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
TfLiteRegistration Register_FULLY_CONNECTED() {
|
||||
return {/*init=*/fully_connected::Init,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/fully_connected::Prepare,
|
||||
/*invoke=*/fully_connected::Eval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
} // namespace micro
|
||||
|
||||
@ -14,16 +14,19 @@ limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include "tensorflow/lite/c/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/portable_tensor.h"
|
||||
#include "tensorflow/lite/kernels/internal/reference/integer_ops/l2normalization.h"
|
||||
#include "tensorflow/lite/kernels/internal/reference/l2normalization.h"
|
||||
#include "tensorflow/lite/kernels/internal/tensor.h"
|
||||
#include "tensorflow/lite/kernels/kernel_util.h"
|
||||
#include "tensorflow/lite/micro/kernels/kernel_util.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace ops {
|
||||
namespace micro {
|
||||
namespace l2norm {
|
||||
|
||||
namespace {
|
||||
|
||||
// This file has two implementation of L2Norm.
|
||||
enum KernelType {
|
||||
kReference,
|
||||
@ -33,9 +36,15 @@ enum KernelType {
|
||||
constexpr int kInputTensor = 0;
|
||||
constexpr int kOutputTensor = 0;
|
||||
|
||||
} // namespace
|
||||
|
||||
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
|
||||
#if defined(DEBUG)
|
||||
TFLITE_DCHECK(node->user_data != nullptr);
|
||||
TFLITE_DCHECK(node->builtin_data != nullptr);
|
||||
|
||||
auto* params = reinterpret_cast<TfLiteL2NormParams*>(node->builtin_data);
|
||||
L2NormalizationParams* data =
|
||||
static_cast<L2NormalizationParams*>(node->user_data);
|
||||
|
||||
TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
|
||||
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
|
||||
@ -48,29 +57,36 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
|
||||
TF_LITE_ENSURE(context, output->type == kTfLiteFloat32 ||
|
||||
output->type == kTfLiteUInt8 ||
|
||||
output->type == kTfLiteInt8);
|
||||
TF_LITE_ENSURE_EQ(context, input->type, output->type);
|
||||
TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
|
||||
|
||||
if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
|
||||
TF_LITE_ENSURE_EQ(context, output->params.scale, (1. / 128.));
|
||||
if (output->type == kTfLiteUInt8) {
|
||||
TF_LITE_ENSURE_EQ(context, output->params.zero_point, 128);
|
||||
}
|
||||
if (output->type == kTfLiteInt8) {
|
||||
TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
|
||||
}
|
||||
data->input_zero_point = input->params.zero_point;
|
||||
} else if (output->type == kTfLiteFloat32) {
|
||||
data->input_zero_point = 0;
|
||||
}
|
||||
|
||||
// TODO(ahentz): For some reason our implementations don't support
|
||||
// activations.
|
||||
TF_LITE_ENSURE_EQ(context, params->activation, kTfLiteActNone);
|
||||
#endif
|
||||
|
||||
return kTfLiteOk;
|
||||
}
|
||||
|
||||
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
|
||||
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
|
||||
return context->AllocatePersistentBuffer(context,
|
||||
sizeof(L2NormalizationParams));
|
||||
}
|
||||
|
||||
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
|
||||
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
|
||||
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
|
||||
TFLITE_DCHECK(node->user_data != nullptr);
|
||||
const L2NormalizationParams& data =
|
||||
*(static_cast<const L2NormalizationParams*>(node->user_data));
|
||||
|
||||
const TfLiteEvalTensor* input =
|
||||
tflite::micro::GetEvalInput(context, node, kInputTensor);
|
||||
TfLiteEvalTensor* output =
|
||||
tflite::micro::GetEvalOutput(context, node, kOutputTensor);
|
||||
|
||||
// TODO(b/143912164): instead of hardcode the epsilon here, we should read it
|
||||
// from tensorflow, i.e., adding a params.
|
||||
@ -87,39 +103,32 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
|
||||
// So we don't even need to do handle the epsilon for quantized kernel case.
|
||||
const float epsilon = 1e-6f;
|
||||
if (output->type == kTfLiteFloat32) {
|
||||
#define TF_LITE_L2NORM(type) \
|
||||
tflite::L2NormalizationParams op_params; \
|
||||
op_params.input_zero_point = 0; \
|
||||
type::L2Normalization(op_params, GetTensorShape(input), \
|
||||
GetTensorData<float>(input), GetTensorShape(output), \
|
||||
GetTensorData<float>(output), epsilon)
|
||||
|
||||
TF_LITE_L2NORM(reference_ops);
|
||||
#undef TF_LITE_L2NORM
|
||||
reference_ops::L2Normalization(data, tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<float>(input),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<float>(output),
|
||||
epsilon);
|
||||
} else if (output->type == kTfLiteUInt8) {
|
||||
#define TF_LITE_L2NORM(type) \
|
||||
tflite::L2NormalizationParams op_params; \
|
||||
op_params.input_zero_point = input->params.zero_point; \
|
||||
type::L2Normalization(op_params, GetTensorShape(input), \
|
||||
GetTensorData<uint8>(input), GetTensorShape(output), \
|
||||
GetTensorData<uint8>(output))
|
||||
|
||||
TF_LITE_L2NORM(reference_ops);
|
||||
#undef TF_LITE_L2NORM
|
||||
reference_ops::L2Normalization(
|
||||
data, tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<uint8_t>(input),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<uint8_t>(output));
|
||||
} else if (output->type == kTfLiteInt8) {
|
||||
const auto input_shape = GetTensorShape(input);
|
||||
const auto output_shape = GetTensorShape(output);
|
||||
const auto input_shape = tflite::micro::GetTensorShape(input);
|
||||
const auto output_shape = tflite::micro::GetTensorShape(output);
|
||||
const int trailing_dim = input_shape.DimensionsCount() - 1;
|
||||
const int depth =
|
||||
MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
|
||||
const int outer_size =
|
||||
MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
|
||||
reference_integer_ops::L2Normalization(input->params.zero_point, outer_size,
|
||||
depth, GetTensorData<int8>(input),
|
||||
GetTensorData<int8>(output));
|
||||
reference_integer_ops::L2Normalization(
|
||||
data.input_zero_point, outer_size, depth,
|
||||
tflite::micro::GetTensorData<int8_t>(input),
|
||||
tflite::micro::GetTensorData<int8_t>(output));
|
||||
} else {
|
||||
TF_LITE_KERNEL_LOG(context, "Output type is %d, requires float.",
|
||||
output->type);
|
||||
TF_LITE_KERNEL_LOG(context, "Output type is %s, requires float.",
|
||||
TfLiteTypeGetName(output->type));
|
||||
return kTfLiteError;
|
||||
}
|
||||
|
||||
@ -128,22 +137,18 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
|
||||
|
||||
} // namespace l2norm
|
||||
|
||||
TfLiteRegistration* Register_L2NORM_REF() {
|
||||
static TfLiteRegistration r = {/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/l2norm::Prepare,
|
||||
/*invoke=*/l2norm::Eval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
|
||||
return &r;
|
||||
TfLiteRegistration Register_L2NORM_REF() {
|
||||
return {/*init=*/l2norm::Init,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/l2norm::Prepare,
|
||||
/*invoke=*/l2norm::Eval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
TfLiteRegistration* Register_L2_NORMALIZATION() {
|
||||
return Register_L2NORM_REF();
|
||||
}
|
||||
TfLiteRegistration Register_L2_NORMALIZATION() { return Register_L2NORM_REF(); }
|
||||
|
||||
} // namespace micro
|
||||
} // namespace ops
|
||||
|
||||
@ -15,8 +15,8 @@ limitations under the License.
|
||||
#include "tensorflow/lite/c/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/reference/binary_function.h"
|
||||
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
|
||||
#include "tensorflow/lite/kernels/kernel_util.h"
|
||||
#include "tensorflow/lite/kernels/op_macros.h"
|
||||
#include "tensorflow/lite/micro/kernels/kernel_util.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace ops {
|
||||
@ -31,20 +31,29 @@ constexpr int kOutputTensor = 0;
|
||||
|
||||
TfLiteStatus LogicalImpl(TfLiteContext* context, TfLiteNode* node,
|
||||
bool (*func)(bool, bool)) {
|
||||
const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
|
||||
const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
|
||||
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
|
||||
const TfLiteEvalTensor* input1 =
|
||||
tflite::micro::GetEvalInput(context, node, kInputTensor1);
|
||||
const TfLiteEvalTensor* input2 =
|
||||
tflite::micro::GetEvalInput(context, node, kInputTensor2);
|
||||
TfLiteEvalTensor* output =
|
||||
tflite::micro::GetEvalOutput(context, node, kOutputTensor);
|
||||
|
||||
if (HaveSameShapes(input1, input2)) {
|
||||
if (tflite::micro::HaveSameShapes(input1, input2)) {
|
||||
reference_ops::BinaryFunction<bool, bool, bool>(
|
||||
GetTensorShape(input1), GetTensorData<bool>(input1),
|
||||
GetTensorShape(input2), GetTensorData<bool>(input2),
|
||||
GetTensorShape(output), GetTensorData<bool>(output), func);
|
||||
tflite::micro::GetTensorShape(input1),
|
||||
tflite::micro::GetTensorData<bool>(input1),
|
||||
tflite::micro::GetTensorShape(input2),
|
||||
tflite::micro::GetTensorData<bool>(input2),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<bool>(output), func);
|
||||
} else {
|
||||
reference_ops::BroadcastBinaryFunction4DSlow<bool, bool, bool>(
|
||||
GetTensorShape(input1), GetTensorData<bool>(input1),
|
||||
GetTensorShape(input2), GetTensorData<bool>(input2),
|
||||
GetTensorShape(output), GetTensorData<bool>(output), func);
|
||||
tflite::micro::GetTensorShape(input1),
|
||||
tflite::micro::GetTensorData<bool>(input1),
|
||||
tflite::micro::GetTensorShape(input2),
|
||||
tflite::micro::GetTensorData<bool>(input2),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<bool>(output), func);
|
||||
}
|
||||
|
||||
return kTfLiteOk;
|
||||
@ -65,32 +74,30 @@ TfLiteStatus LogicalAndEval(TfLiteContext* context, TfLiteNode* node) {
|
||||
} // namespace
|
||||
} // namespace logical
|
||||
|
||||
TfLiteRegistration* Register_LOGICAL_OR() {
|
||||
TfLiteRegistration Register_LOGICAL_OR() {
|
||||
// Init, Free, Prepare, Eval are satisfying the Interface required by
|
||||
// TfLiteRegistration.
|
||||
static TfLiteRegistration r = {/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/nullptr,
|
||||
/*invoke=*/logical::LogicalOrEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
return {/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/nullptr,
|
||||
/*invoke=*/logical::LogicalOrEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
TfLiteRegistration* Register_LOGICAL_AND() {
|
||||
TfLiteRegistration Register_LOGICAL_AND() {
|
||||
// Init, Free, Prepare, Eval are satisfying the Interface required by
|
||||
// TfLiteRegistration.
|
||||
static TfLiteRegistration r = {/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/nullptr,
|
||||
/*invoke=*/logical::LogicalAndEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
return {/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/nullptr,
|
||||
/*invoke=*/logical::LogicalAndEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
} // namespace micro
|
||||
|
||||
@ -23,6 +23,7 @@ limitations under the License.
|
||||
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
|
||||
#include "tensorflow/lite/kernels/kernel_util.h"
|
||||
#include "tensorflow/lite/kernels/op_macros.h"
|
||||
#include "tensorflow/lite/micro/kernels/kernel_util.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace ops {
|
||||
@ -44,7 +45,7 @@ TfLiteStatus CalculateArithmeticOpData(TfLiteContext* context, TfLiteNode* node,
|
||||
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
|
||||
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
|
||||
|
||||
TF_LITE_ENSURE_EQ(context, input->type, output->type);
|
||||
TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
|
||||
if (input->type == kTfLiteInt8) {
|
||||
TF_LITE_ENSURE_EQ(context, output->params.zero_point,
|
||||
std::numeric_limits<int8_t>::min());
|
||||
@ -54,6 +55,8 @@ TfLiteStatus CalculateArithmeticOpData(TfLiteContext* context, TfLiteNode* node,
|
||||
static_cast<double>(input->params.scale) *
|
||||
static_cast<double>(1 << (31 - kInputIntegerBits));
|
||||
|
||||
data->input_zero_point = input->params.zero_point;
|
||||
|
||||
const double q = std::frexp(input_real_multiplier, &data->input_left_shift);
|
||||
data->input_multiplier = static_cast<int32_t>(TfLiteRound(q * (1ll << 31)));
|
||||
|
||||
@ -64,18 +67,34 @@ TfLiteStatus CalculateArithmeticOpData(TfLiteContext* context, TfLiteNode* node,
}
} // namespace

void* LogisticInit(TfLiteContext* context, const char* buffer, size_t length) {
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
return context->AllocatePersistentBuffer(context, sizeof(OpData));
}

TfLiteStatus LogisticPrepare(TfLiteContext* context, TfLiteNode* node) {
TFLITE_DCHECK(node->user_data != nullptr);
OpData* data = static_cast<OpData*>(node->user_data);

return CalculateArithmeticOpData(context, node, data);
}

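The LOGISTIC hunk illustrates the lifecycle change made throughout these kernels: scale-dependent parameters that used to be recomputed on every invoke are now derived once in Prepare and parked in node->user_data, so Eval only reads the cached OpData. A rough skeleton of that split; the field names and trivial bodies are illustrative assumptions, not the commit's code:

#include <cstdint>

#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"

namespace tflite {
namespace ops {
namespace micro {
namespace example_op {

struct OpData {               // assumed fields, for illustration only
  int32_t input_zero_point;
  int32_t input_multiplier;
  int input_left_shift;
};

void* Init(TfLiteContext* context, const char* buffer, size_t length) {
  // One persistent allocation per node; it becomes node->user_data.
  return context->AllocatePersistentBuffer(context, sizeof(OpData));
}

TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  OpData* data = static_cast<OpData*>(node->user_data);
  // The full TfLiteTensor, with quantization params, is only read here.
  const TfLiteTensor* input = GetInput(context, node, 0);
  data->input_zero_point = input->params.zero_point;
  data->input_multiplier = 0;   // real kernels derive this from params.scale
  data->input_left_shift = 0;
  return kTfLiteOk;
}

TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
  const OpData* data = static_cast<const OpData*>(node->user_data);
  const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0);
  TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0);
  // Arithmetic elided; only the cached data and eval tensors are touched.
  (void)data; (void)input; (void)output;
  return kTfLiteOk;
}

}  // namespace example_op
}  // namespace micro
}  // namespace ops
}  // namespace tflite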
TfLiteStatus LogisticEval(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
OpData data;
CalculateArithmeticOpData(context, node, &data);
const TfLiteEvalTensor* input =
tflite::micro::GetEvalInput(context, node, kInputTensor);
TfLiteEvalTensor* output =
tflite::micro::GetEvalOutput(context, node, kOutputTensor);

TFLITE_DCHECK(node->user_data != nullptr);
OpData* data = static_cast<OpData*>(node->user_data);

if (input->type == kTfLiteFloat32) {
switch (output->type) {
case kTfLiteFloat32: {
reference_ops::Logistic(
GetTensorShape(input), GetTensorData<float>(input),
GetTensorShape(output), GetTensorData<float>(output));
reference_ops::Logistic(tflite::micro::GetTensorShape(input),
tflite::micro::GetTensorData<float>(input),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<float>(output));
return kTfLiteOk;
}
default:
@ -88,10 +107,11 @@ TfLiteStatus LogisticEval(TfLiteContext* context, TfLiteNode* node) {
|
||||
switch (output->type) {
|
||||
case kTfLiteInt8: {
|
||||
reference_integer_ops::Logistic(
|
||||
input->params.zero_point, data.input_range_radius,
|
||||
data.input_multiplier, data.input_left_shift,
|
||||
NumElements(input->dims), GetTensorData<int8_t>(input),
|
||||
GetTensorData<int8_t>(output));
|
||||
data->input_zero_point, data->input_range_radius,
|
||||
data->input_multiplier, data->input_left_shift,
|
||||
NumElements(input->dims),
|
||||
tflite::micro::GetTensorData<int8_t>(input),
|
||||
tflite::micro::GetTensorData<int8_t>(output));
|
||||
return kTfLiteOk;
|
||||
}
|
||||
default:
|
||||
@ -113,16 +133,15 @@ TfLiteStatus LogisticEval(TfLiteContext* context, TfLiteNode* node) {
|
||||
|
||||
} // namespace activations
|
||||
|
||||
TfLiteRegistration* Register_LOGISTIC() {
|
||||
static TfLiteRegistration r = {/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/nullptr,
|
||||
/*invoke=*/activations::LogisticEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
TfLiteRegistration Register_LOGISTIC() {
|
||||
return {/*init=*/activations::LogisticInit,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/activations::LogisticPrepare,
|
||||
/*invoke=*/activations::LogisticEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
} // namespace micro
|
||||
} // namespace ops
|
||||
|
||||
@ -22,6 +22,7 @@ limitations under the License.
|
||||
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
|
||||
#include "tensorflow/lite/kernels/kernel_util.h"
|
||||
#include "tensorflow/lite/kernels/op_macros.h"
|
||||
#include "tensorflow/lite/micro/kernels/kernel_util.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace ops {
|
||||
@ -40,13 +41,13 @@ constexpr int kOutputTensor = 0;
|
||||
|
||||
struct OpContext {
|
||||
OpContext(TfLiteContext* context, TfLiteNode* node) {
|
||||
input1 = GetInput(context, node, kInputTensor1);
|
||||
input2 = GetInput(context, node, kInputTensor2);
|
||||
output = GetOutput(context, node, kOutputTensor);
|
||||
input1 = tflite::micro::GetEvalInput(context, node, kInputTensor1);
|
||||
input2 = tflite::micro::GetEvalInput(context, node, kInputTensor2);
|
||||
output = tflite::micro::GetEvalOutput(context, node, kOutputTensor);
|
||||
}
|
||||
const TfLiteTensor* input1;
|
||||
const TfLiteTensor* input2;
|
||||
TfLiteTensor* output;
|
||||
const TfLiteEvalTensor* input1;
|
||||
const TfLiteEvalTensor* input2;
|
||||
TfLiteEvalTensor* output;
|
||||
};
|
||||
|
||||
struct MaximumOp {
|
||||
@ -69,12 +70,12 @@ template <typename data_type, typename op_type>
|
||||
void TFLiteOperation(TfLiteContext* context, TfLiteNode* node,
|
||||
const OpContext& op_context) {
|
||||
reference_ops::MaximumMinimumBroadcastSlow(
|
||||
GetTensorShape(op_context.input1),
|
||||
GetTensorData<data_type>(op_context.input1),
|
||||
GetTensorShape(op_context.input2),
|
||||
GetTensorData<data_type>(op_context.input2),
|
||||
GetTensorShape(op_context.output),
|
||||
GetTensorData<data_type>(op_context.output),
|
||||
tflite::micro::GetTensorShape(op_context.input1),
|
||||
tflite::micro::GetTensorData<data_type>(op_context.input1),
|
||||
tflite::micro::GetTensorShape(op_context.input2),
|
||||
tflite::micro::GetTensorData<data_type>(op_context.input2),
|
||||
tflite::micro::GetTensorShape(op_context.output),
|
||||
tflite::micro::GetTensorData<data_type>(op_context.output),
|
||||
op_type::template op<data_type>);
|
||||
}
|
||||
|
||||
@ -116,34 +117,30 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
|
||||
|
||||
} // namespace maximum_minimum
|
||||
|
||||
TfLiteRegistration* Register_MAXIMUM() {
|
||||
static TfLiteRegistration r = {
|
||||
/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/nullptr,
|
||||
/*invoke=*/
|
||||
maximum_minimum::Eval<maximum_minimum::kReference,
|
||||
maximum_minimum::MaximumOp>,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
TfLiteRegistration Register_MAXIMUM() {
|
||||
return {/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/nullptr,
|
||||
/*invoke=*/
|
||||
maximum_minimum::Eval<maximum_minimum::kReference,
|
||||
maximum_minimum::MaximumOp>,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
TfLiteRegistration* Register_MINIMUM() {
|
||||
static TfLiteRegistration r = {
|
||||
/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/nullptr,
|
||||
/*invoke=*/
|
||||
maximum_minimum::Eval<maximum_minimum::kReference,
|
||||
maximum_minimum::MinimumOp>,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
TfLiteRegistration Register_MINIMUM() {
|
||||
return {/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/nullptr,
|
||||
/*invoke=*/
|
||||
maximum_minimum::Eval<maximum_minimum::kReference,
|
||||
maximum_minimum::MinimumOp>,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
} // namespace micro
|
||||
|
||||
@ -29,59 +29,61 @@ namespace micro {
|
||||
// their model requires, using a custom `(Micro)MutableOpResolver`. Selective
|
||||
// registration in turn allows the linker to strip unused kernels.
|
||||
|
||||
TfLiteRegistration* Register_ABS();
|
||||
TfLiteRegistration* Register_ADD();
|
||||
TfLiteRegistration* Register_ARG_MAX();
|
||||
TfLiteRegistration* Register_ARG_MIN();
|
||||
TfLiteRegistration* Register_AVERAGE_POOL_2D();
|
||||
TfLiteRegistration* Register_CEIL();
|
||||
TfLiteRegistration Register_ABS();
|
||||
TfLiteRegistration Register_ADD();
|
||||
TfLiteRegistration Register_ARG_MAX();
|
||||
TfLiteRegistration Register_ARG_MIN();
|
||||
TfLiteRegistration Register_AVERAGE_POOL_2D();
|
||||
TfLiteRegistration Register_CEIL();
|
||||
// TODO(b/160234179): Change custom OPs to also return by value.
|
||||
TfLiteRegistration* Register_CIRCULAR_BUFFER();
|
||||
TfLiteRegistration* Register_CONV_2D();
|
||||
TfLiteRegistration* Register_CONCATENATION();
|
||||
TfLiteRegistration* Register_COS();
|
||||
TfLiteRegistration* Register_DEPTHWISE_CONV_2D();
|
||||
TfLiteRegistration* Register_DEQUANTIZE();
|
||||
TfLiteRegistration* Register_EQUAL();
|
||||
TfLiteRegistration* Register_FLOOR();
|
||||
TfLiteRegistration* Register_FULLY_CONNECTED();
|
||||
TfLiteRegistration* Register_GREATER();
|
||||
TfLiteRegistration* Register_GREATER_EQUAL();
|
||||
TfLiteRegistration* Register_LESS();
|
||||
TfLiteRegistration* Register_LESS_EQUAL();
|
||||
TfLiteRegistration* Register_LOG();
|
||||
TfLiteRegistration* Register_LOGICAL_AND();
|
||||
TfLiteRegistration* Register_LOGICAL_NOT();
|
||||
TfLiteRegistration* Register_LOGICAL_OR();
|
||||
TfLiteRegistration* Register_LOGISTIC();
|
||||
TfLiteRegistration* Register_MAXIMUM();
|
||||
TfLiteRegistration* Register_MAX_POOL_2D();
|
||||
TfLiteRegistration* Register_MEAN();
|
||||
TfLiteRegistration* Register_MINIMUM();
|
||||
TfLiteRegistration* Register_MUL();
|
||||
TfLiteRegistration* Register_NEG();
|
||||
TfLiteRegistration* Register_NOT_EQUAL();
|
||||
TfLiteRegistration* Register_PACK();
|
||||
TfLiteRegistration* Register_PAD();
|
||||
TfLiteRegistration* Register_PADV2();
|
||||
TfLiteRegistration* Register_PRELU();
|
||||
TfLiteRegistration* Register_QUANTIZE();
|
||||
TfLiteRegistration* Register_RELU();
|
||||
TfLiteRegistration* Register_RELU6();
|
||||
TfLiteRegistration* Register_RESHAPE();
|
||||
TfLiteRegistration* Register_RESIZE_NEAREST_NEIGHBOR();
|
||||
TfLiteRegistration* Register_ROUND();
|
||||
TfLiteRegistration* Register_RSQRT();
|
||||
TfLiteRegistration* Register_SIN();
|
||||
TfLiteRegistration* Register_SOFTMAX();
|
||||
TfLiteRegistration* Register_SPLIT();
|
||||
TfLiteRegistration* Register_SQRT();
|
||||
TfLiteRegistration* Register_SQUARE();
|
||||
TfLiteRegistration* Register_STRIDED_SLICE();
|
||||
TfLiteRegistration* Register_SUB();
|
||||
TfLiteRegistration* Register_SVDF();
|
||||
TfLiteRegistration* Register_UNPACK();
|
||||
TfLiteRegistration* Register_L2_NORMALIZATION();
|
||||
TfLiteRegistration* Register_TANH();
|
||||
TfLiteRegistration Register_CONV_2D();
|
||||
TfLiteRegistration Register_CONCATENATION();
|
||||
TfLiteRegistration Register_COS();
|
||||
TfLiteRegistration Register_DEPTHWISE_CONV_2D();
|
||||
TfLiteRegistration Register_DEQUANTIZE();
|
||||
TfLiteRegistration Register_EQUAL();
|
||||
TfLiteRegistration Register_FLOOR();
|
||||
TfLiteRegistration Register_FULLY_CONNECTED();
|
||||
TfLiteRegistration Register_GREATER();
|
||||
TfLiteRegistration Register_GREATER_EQUAL();
|
||||
TfLiteRegistration Register_HARD_SWISH();
|
||||
TfLiteRegistration Register_LESS();
|
||||
TfLiteRegistration Register_LESS_EQUAL();
|
||||
TfLiteRegistration Register_LOG();
|
||||
TfLiteRegistration Register_LOGICAL_AND();
|
||||
TfLiteRegistration Register_LOGICAL_NOT();
|
||||
TfLiteRegistration Register_LOGICAL_OR();
|
||||
TfLiteRegistration Register_LOGISTIC();
|
||||
TfLiteRegistration Register_MAXIMUM();
|
||||
TfLiteRegistration Register_MAX_POOL_2D();
|
||||
TfLiteRegistration Register_MEAN();
|
||||
TfLiteRegistration Register_MINIMUM();
|
||||
TfLiteRegistration Register_MUL();
|
||||
TfLiteRegistration Register_NEG();
|
||||
TfLiteRegistration Register_NOT_EQUAL();
|
||||
TfLiteRegistration Register_PACK();
|
||||
TfLiteRegistration Register_PAD();
|
||||
TfLiteRegistration Register_PADV2();
|
||||
TfLiteRegistration Register_PRELU();
|
||||
TfLiteRegistration Register_QUANTIZE();
|
||||
TfLiteRegistration Register_RELU();
|
||||
TfLiteRegistration Register_RELU6();
|
||||
TfLiteRegistration Register_RESHAPE();
|
||||
TfLiteRegistration Register_RESIZE_NEAREST_NEIGHBOR();
|
||||
TfLiteRegistration Register_ROUND();
|
||||
TfLiteRegistration Register_RSQRT();
|
||||
TfLiteRegistration Register_SIN();
|
||||
TfLiteRegistration Register_SOFTMAX();
|
||||
TfLiteRegistration Register_SPLIT();
|
||||
TfLiteRegistration Register_SQRT();
|
||||
TfLiteRegistration Register_SQUARE();
|
||||
TfLiteRegistration Register_STRIDED_SLICE();
|
||||
TfLiteRegistration Register_SUB();
|
||||
TfLiteRegistration Register_SVDF();
|
||||
TfLiteRegistration Register_UNPACK();
|
||||
TfLiteRegistration Register_L2_NORMALIZATION();
|
||||
TfLiteRegistration Register_TANH();
|
||||
|
||||
} // namespace micro
|
||||
} // namespace ops
|
||||
|
||||
@ -21,22 +21,31 @@ limitations under the License.
|
||||
#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h"
|
||||
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
|
||||
#include "tensorflow/lite/kernels/kernel_util.h"
|
||||
#include "tensorflow/lite/micro/kernels/kernel_util.h"
|
||||
#include "tensorflow/lite/micro/memory_helpers.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace ops {
|
||||
namespace micro {
|
||||
namespace mul {
|
||||
namespace {
|
||||
|
||||
constexpr int kInput1Tensor = 0;
|
||||
constexpr int kInput2Tensor = 1;
|
||||
constexpr int kOutputTensor = 0;
|
||||
|
||||
struct OpData {
|
||||
int32_t input1_zero_point;
|
||||
int32_t input2_zero_point;
|
||||
|
||||
int32_t output_activation_min;
|
||||
int32_t output_activation_max;
|
||||
|
||||
int32_t output_zero_point;
|
||||
int32_t output_multiplier;
|
||||
int output_shift;
|
||||
|
||||
float output_activation_min_f32;
|
||||
float output_activation_max_f32;
|
||||
};
|
||||
|
||||
TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
|
||||
@ -48,105 +57,155 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
|
||||
TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
|
||||
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
|
||||
|
||||
TF_LITE_ENSURE_EQ(context, input1->type, input2->type);
|
||||
|
||||
TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
|
||||
context, params->activation, output, &data->output_activation_min,
|
||||
&data->output_activation_max));
|
||||
TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type);
|
||||
|
||||
if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
|
||||
TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
|
||||
context, params->activation, output, &data->output_activation_min,
|
||||
&data->output_activation_max));
|
||||
|
||||
double real_multiplier = static_cast<double>(input1->params.scale) *
|
||||
static_cast<double>(input2->params.scale) /
|
||||
static_cast<double>(output->params.scale);
|
||||
QuantizeMultiplier(real_multiplier, &data->output_multiplier,
|
||||
&data->output_shift);
|
||||
|
||||
data->input1_zero_point = input1->params.zero_point;
|
||||
data->input2_zero_point = input2->params.zero_point;
|
||||
data->output_zero_point = output->params.zero_point;
|
||||
} else {
|
||||
CalculateActivationRange(params->activation,
|
||||
&data->output_activation_min_f32,
|
||||
&data->output_activation_max_f32);
|
||||
}
|
||||
|
||||
return kTfLiteOk;
|
||||
}
|
||||
|
||||
void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
|
||||
TfLiteMulParams* params, OpData* data,
|
||||
const TfLiteTensor* input1, const TfLiteTensor* input2,
|
||||
TfLiteTensor* output) {
|
||||
if (output->type == kTfLiteInt8 || output->type == kTfLiteUInt8) {
|
||||
tflite::ArithmeticParams op_params;
|
||||
SetActivationParams(data->output_activation_min,
|
||||
data->output_activation_max, &op_params);
|
||||
op_params.input1_offset = -input1->params.zero_point;
|
||||
op_params.input2_offset = -input2->params.zero_point;
|
||||
op_params.output_offset = output->params.zero_point;
|
||||
op_params.output_multiplier = data->output_multiplier;
|
||||
op_params.output_shift = data->output_shift;
|
||||
bool need_broadcast = reference_ops::ProcessBroadcastShapes(
|
||||
GetTensorShape(input1), GetTensorShape(input2), &op_params);
|
||||
} // namespace
|
||||
|
||||
#define TF_LITE_MUL(type, opname, dtype) \
|
||||
type::opname(op_params, GetTensorShape(input1), \
|
||||
GetTensorData<dtype>(input1), GetTensorShape(input2), \
|
||||
GetTensorData<dtype>(input2), GetTensorShape(output), \
|
||||
GetTensorData<dtype>(output));
|
||||
void EvalQuantized(TfLiteContext* context, TfLiteNode* node, const OpData* data,
|
||||
const TfLiteEvalTensor* input1,
|
||||
const TfLiteEvalTensor* input2, TfLiteEvalTensor* output) {
|
||||
tflite::ArithmeticParams op_params = {};
|
||||
op_params.quantized_activation_min = data->output_activation_min;
|
||||
op_params.quantized_activation_max = data->output_activation_max;
|
||||
op_params.float_activation_max = data->output_activation_max_f32;
|
||||
op_params.input1_offset = -data->input1_zero_point;
|
||||
op_params.input2_offset = -data->input2_zero_point;
|
||||
op_params.output_offset = data->output_zero_point;
|
||||
op_params.output_multiplier = data->output_multiplier;
|
||||
op_params.output_shift = data->output_shift;
|
||||
|
||||
if (output->type == kTfLiteInt8) {
|
||||
if (need_broadcast) {
|
||||
TF_LITE_MUL(reference_integer_ops, BroadcastMul4DSlow, int8_t);
|
||||
} else {
|
||||
TF_LITE_MUL(reference_integer_ops, Mul, int8_t);
|
||||
}
|
||||
} else if (output->type == kTfLiteUInt8) {
|
||||
if (need_broadcast) {
|
||||
TF_LITE_MUL(reference_ops, BroadcastMul4DSlow, uint8_t);
|
||||
} else {
|
||||
TF_LITE_MUL(reference_ops, Mul, uint8_t);
|
||||
}
|
||||
bool need_broadcast = reference_ops::ProcessBroadcastShapes(
|
||||
tflite::micro::GetTensorShape(input1),
|
||||
tflite::micro::GetTensorShape(input2), &op_params);
|
||||
|
||||
if (output->type == kTfLiteInt8) {
|
||||
if (need_broadcast) {
|
||||
reference_integer_ops::BroadcastMul4DSlow(
|
||||
op_params, tflite::micro::GetTensorShape(input1),
|
||||
tflite::micro::GetTensorData<int8_t>(input1),
|
||||
tflite::micro::GetTensorShape(input2),
|
||||
tflite::micro::GetTensorData<int8_t>(input2),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<int8_t>(output));
|
||||
} else {
|
||||
reference_integer_ops::Mul(op_params,
|
||||
tflite::micro::GetTensorShape(input1),
|
||||
tflite::micro::GetTensorData<int8_t>(input1),
|
||||
tflite::micro::GetTensorShape(input2),
|
||||
tflite::micro::GetTensorData<int8_t>(input2),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<int8_t>(output));
|
||||
}
|
||||
} else if (output->type == kTfLiteUInt8) {
|
||||
if (need_broadcast) {
|
||||
reference_integer_ops::BroadcastMul4DSlow(
|
||||
op_params, tflite::micro::GetTensorShape(input1),
|
||||
tflite::micro::GetTensorData<uint8_t>(input1),
|
||||
tflite::micro::GetTensorShape(input2),
|
||||
tflite::micro::GetTensorData<uint8_t>(input2),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<uint8_t>(output));
|
||||
} else {
|
||||
reference_integer_ops::Mul(op_params,
|
||||
tflite::micro::GetTensorShape(input1),
|
||||
tflite::micro::GetTensorData<uint8_t>(input1),
|
||||
tflite::micro::GetTensorShape(input2),
|
||||
tflite::micro::GetTensorData<uint8_t>(input2),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<uint8_t>(output));
|
||||
}
|
||||
#undef TF_LITE_MUL
|
||||
}
|
||||
}
|
||||
|
||||
void EvalFloat(TfLiteContext* context, TfLiteNode* node,
|
||||
TfLiteMulParams* params, OpData* data,
|
||||
const TfLiteTensor* input1, const TfLiteTensor* input2,
|
||||
TfLiteTensor* output) {
|
||||
float output_activation_min, output_activation_max;
|
||||
CalculateActivationRange(params->activation, &output_activation_min,
|
||||
&output_activation_max);
|
||||
tflite::ArithmeticParams op_params;
|
||||
SetActivationParams(output_activation_min, output_activation_max, &op_params);
|
||||
TfLiteMulParams* params, const OpData* data,
|
||||
const TfLiteEvalTensor* input1, const TfLiteEvalTensor* input2,
|
||||
TfLiteEvalTensor* output) {
|
||||
tflite::ArithmeticParams op_params = {};
|
||||
op_params.float_activation_min = data->output_activation_min_f32;
|
||||
op_params.float_activation_max = data->output_activation_max_f32;
|
||||
|
||||
bool need_broadcast = reference_ops::ProcessBroadcastShapes(
|
||||
GetTensorShape(input1), GetTensorShape(input2), &op_params);
|
||||
#define TF_LITE_MUL(opname) \
|
||||
reference_ops::opname(op_params, GetTensorShape(input1), \
|
||||
GetTensorData<float>(input1), GetTensorShape(input2), \
|
||||
GetTensorData<float>(input2), GetTensorShape(output), \
|
||||
GetTensorData<float>(output));
|
||||
tflite::micro::GetTensorShape(input1),
|
||||
tflite::micro::GetTensorShape(input2), &op_params);
|
||||
|
||||
if (need_broadcast) {
|
||||
TF_LITE_MUL(BroadcastMul4DSlow);
|
||||
reference_ops::BroadcastMul4DSlow(
|
||||
op_params, tflite::micro::GetTensorShape(input1),
|
||||
tflite::micro::GetTensorData<float>(input1),
|
||||
tflite::micro::GetTensorShape(input2),
|
||||
tflite::micro::GetTensorData<float>(input2),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<float>(output));
|
||||
} else {
|
||||
TF_LITE_MUL(Mul);
|
||||
reference_ops::Mul(op_params, tflite::micro::GetTensorShape(input1),
|
||||
tflite::micro::GetTensorData<float>(input1),
|
||||
tflite::micro::GetTensorShape(input2),
|
||||
tflite::micro::GetTensorData<float>(input2),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<float>(output));
|
||||
}
|
||||
#undef TF_LITE_MUL
|
||||
}

void* Init(TfLiteContext* context, const char* buffer, size_t length) {
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
return context->AllocatePersistentBuffer(context, sizeof(OpData));
}

TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TFLITE_DCHECK(node->builtin_data != nullptr);
auto* params = reinterpret_cast<TfLiteMulParams*>(node->builtin_data);

TFLITE_DCHECK(node->user_data != nullptr);
OpData* data = static_cast<OpData*>(node->user_data);

return CalculateOpData(context, node, params, data);
}

TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
TFLITE_DCHECK(node->builtin_data != nullptr);
auto* params = reinterpret_cast<TfLiteMulParams*>(node->builtin_data);
OpData data;

const TfLiteTensor* input1 = GetInput(context, node, kInput1Tensor);
const TfLiteTensor* input2 = GetInput(context, node, kInput2Tensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TFLITE_DCHECK(node->user_data != nullptr);
const OpData* data = static_cast<const OpData*>(node->user_data);

CalculateOpData(context, node, params, &data);
const TfLiteEvalTensor* input1 =
tflite::micro::GetEvalInput(context, node, kInput1Tensor);
const TfLiteEvalTensor* input2 =
tflite::micro::GetEvalInput(context, node, kInput2Tensor);
TfLiteEvalTensor* output =
tflite::micro::GetEvalOutput(context, node, kOutputTensor);

switch (input1->type) {
case kTfLiteUInt8:
case kTfLiteInt8:
EvalQuantized(context, node, params, &data, input1, input2, output);
EvalQuantized(context, node, data, input1, input2, output);
break;
case kTfLiteFloat32:
EvalFloat(context, node, params, &data, input1, input2, output);
EvalFloat(context, node, params, data, input1, input2, output);
break;
default:
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
@ -158,16 +217,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
}
} // namespace mul

TfLiteRegistration* Register_MUL() {
static TfLiteRegistration r = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/mul::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
TfLiteRegistration Register_MUL() {
return {/*init=*/mul::Init,
/*free=*/nullptr,
/*prepare=*/mul::Prepare,
/*invoke=*/mul::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
}

} // namespace micro

@ -17,7 +17,7 @@ limitations under the License.

#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"

namespace tflite {
namespace ops {
@ -28,14 +28,17 @@ constexpr int kInputTensor = 0;
constexpr int kOutputTensor = 0;

TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
const TfLiteEvalTensor* input =
tflite::micro::GetEvalInput(context, node, kInputTensor);
TfLiteEvalTensor* output =
tflite::micro::GetEvalOutput(context, node, kOutputTensor);
switch (input->type) {
// TODO(wangtz): handle for kTfLiteInt8
case kTfLiteFloat32:
reference_ops::Negate(GetTensorShape(input), GetTensorData<float>(input),
GetTensorShape(output),
GetTensorData<float>(output));
reference_ops::Negate(tflite::micro::GetTensorShape(input),
tflite::micro::GetTensorData<float>(input),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<float>(output));
break;
default:
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
@ -47,16 +50,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {

} // namespace neg

TfLiteRegistration* Register_NEG() {
static TfLiteRegistration r = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/neg::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
TfLiteRegistration Register_NEG() {
return {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/neg::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
}

} // namespace micro

@ -16,7 +16,7 @@ limitations under the License.
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"

namespace tflite {
namespace ops {
@ -28,9 +28,11 @@ constexpr int kOutputTensor = 0;

template <typename T>
TfLiteStatus PackImpl(TfLiteContext* context, TfLiteNode* node,
TfLiteTensor* output, int values_count, int axis) {
TfLiteEvalTensor* output, int values_count, int axis) {
const TfLiteEvalTensor* input0 =
tflite::micro::GetEvalInput(context, node, 0);

const int dimensions = output->dims->size;
const TfLiteTensor* input0 = GetInput(context, node, 0);
const TfLiteIntArray* input_dims = input0->dims;
const TfLiteIntArray* output_dims = output->dims;

@ -52,11 +54,11 @@ TfLiteStatus PackImpl(TfLiteContext* context, TfLiteNode* node,
}
TFLITE_DCHECK_EQ(input_size, copy_size * outer_size);

T* output_data = GetTensorData<T>(output);
T* output_data = tflite::micro::GetTensorData<T>(output);

for (int i = 0; i < values_count; ++i) {
const TfLiteTensor* t = GetInput(context, node, i);
const T* input_data = GetTensorData<T>(t);
const TfLiteEvalTensor* t = tflite::micro::GetEvalInput(context, node, i);
const T* input_data = tflite::micro::GetTensorData<T>(t);
for (int k = 0; k < outer_size; ++k) {
const T* input_ptr = input_data + copy_size * k;
int loc = k * values_count * copy_size + i * copy_size;
@ -72,7 +74,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
const TfLitePackParams* data =
reinterpret_cast<TfLitePackParams*>(node->builtin_data);

TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TfLiteEvalTensor* output =
tflite::micro::GetEvalOutput(context, node, kOutputTensor);

switch (output->type) {
case kTfLiteFloat32: {
@ -108,16 +111,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
} // namespace
} // namespace pack

TfLiteRegistration* Register_PACK() {
static TfLiteRegistration r = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/pack::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
TfLiteRegistration Register_PACK() {
return {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/pack::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
}

} // namespace micro

@ -16,189 +16,205 @@ limitations under the License.
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
#ifdef MEMORY_SANITIZER
|
||||
#include <sanitizer/msan_interface.h>
|
||||
#else
|
||||
#define __msan_check_mem_is_initialized(ptr, size)
|
||||
#endif
|
||||
|
||||
#include "tensorflow/lite/c/builtin_op_data.h"
|
||||
#include "tensorflow/lite/c/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/tensor.h"
|
||||
#include "tensorflow/lite/kernels/internal/portable_tensor.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
#include "tensorflow/lite/kernels/kernel_util.h"
|
||||
#include "tensorflow/lite/kernels/op_macros.h"
|
||||
#include "tensorflow/lite/micro/kernels/kernel_util.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace ops {
|
||||
namespace micro {
|
||||
namespace pad {
|
||||
namespace {
|
||||
|
||||
struct PadContext {
|
||||
PadContext(TfLiteContext* context, TfLiteNode* node) {
|
||||
input = GetInput(context, node, 0);
|
||||
paddings = GetInput(context, node, 1);
|
||||
constant_values = nullptr;
|
||||
if (NumInputs(node) == 3) {
|
||||
constant_values = GetOptionalInputTensor(context, node, 2);
|
||||
} else {
|
||||
constant_values = nullptr;
|
||||
}
|
||||
output = GetOutput(context, node, 0);
|
||||
dims = NumDimensions(input);
|
||||
|
||||
resizing_category = ResizingCategory::kGenericResize;
|
||||
const int paddings_total = GetTensorShape(paddings).FlatSize();
|
||||
const int32* paddings_data = GetTensorData<int32>(paddings);
|
||||
// Paddings will be a n,2 array, and we need to detect 4D arrays with the
|
||||
// pattern { {0,0}, {a, b}, {c, d}, {0,0} }.
|
||||
if (IsConstantTensor(paddings) && paddings_total == 8 &&
|
||||
(paddings_data[0] == 0 && paddings_data[1] == 0) &&
|
||||
(paddings_data[6] == 0 && paddings_data[7] == 0)) {
|
||||
resizing_category = ResizingCategory::kImageStyle;
|
||||
}
|
||||
}
|
||||
const TfLiteTensor* constant_values;
|
||||
const TfLiteTensor* input;
|
||||
const TfLiteTensor* paddings;
|
||||
TfLiteTensor* output;
|
||||
int dims;
|
||||
ResizingCategory resizing_category;
|
||||
struct OpData {
|
||||
PadParams params;
|
||||
int32_t output_zero_point;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
|
||||
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
|
||||
return context->AllocatePersistentBuffer(context, sizeof(OpData));
|
||||
}
|
||||
|
||||
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
|
||||
TFLITE_DCHECK(node->user_data != nullptr);
|
||||
OpData* data = static_cast<OpData*>(node->user_data);
|
||||
|
||||
TF_LITE_ENSURE(context, NumInputs(node) == 2 || NumInputs(node) == 3);
|
||||
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
|
||||
|
||||
PadContext op_context(context, node);
|
||||
TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type);
|
||||
if (op_context.constant_values != nullptr) {
|
||||
TF_LITE_ENSURE_EQ(context, op_context.input->type,
|
||||
op_context.constant_values->type);
|
||||
const TfLiteTensor* input = GetInput(context, node, /*index=*/0);
|
||||
const TfLiteTensor* paddings = GetInput(context, node, /*index=*/1);
|
||||
const TfLiteTensor* constant_values =
|
||||
NumInputs(node) == 3 ? GetInput(context, node, /*index=*/2) : nullptr;
|
||||
TfLiteTensor* output = GetOutput(context, node, /*index=*/0);
|
||||
|
||||
TF_LITE_ENSURE_EQ(context, input->type, output->type);
|
||||
|
||||
// Current implementations rely on the inputs being <= 4D.
|
||||
TF_LITE_ENSURE(context, NumDimensions(input) <=
|
||||
reference_ops::PadKernelMaxDimensionCount());
|
||||
|
||||
if (constant_values != nullptr) {
|
||||
TF_LITE_ENSURE_EQ(context, input->type, constant_values->type);
|
||||
// Ensure that constant_values is a scalar.
|
||||
TF_LITE_ENSURE_EQ(context, NumElements(constant_values), 1);
|
||||
}
|
||||
|
||||
// There must be a pair of paddings for each output dimension.
|
||||
TF_LITE_ENSURE_EQ(context, GetTensorShape(op_context.paddings).FlatSize(),
|
||||
op_context.output->dims->size * 2);
|
||||
TF_LITE_ENSURE_EQ(context, GetTensorShape(paddings).FlatSize(),
|
||||
output->dims->size * 2);
|
||||
|
||||
// On Micro, outputs must be properly sized by the converter.
|
||||
const int32* paddings_data = GetTensorData<int32>(op_context.paddings);
|
||||
for (int i = 0; i < op_context.output->dims->size; i++) {
|
||||
int output_dim = op_context.output->dims->data[i];
|
||||
int expected_dim = op_context.input->dims->data[i] + paddings_data[i * 2] +
|
||||
paddings_data[i * 2 + 1];
|
||||
// NOTE: This data is only available because the paddings buffer is stored in
|
||||
// the flatbuffer:
|
||||
TF_LITE_ENSURE(context, IsConstantTensor(paddings));
|
||||
const int32_t* paddings_data = GetTensorData<int32_t>(paddings);
|
||||
for (int i = 0; i < output->dims->size; i++) {
|
||||
int output_dim = output->dims->data[i];
|
||||
int expected_dim =
|
||||
input->dims->data[i] + paddings_data[i * 2] + paddings_data[i * 2 + 1];
|
||||
TF_LITE_ENSURE_EQ(context, output_dim, expected_dim);
|
||||
}
|
||||
|
||||
// Current implementations rely on the inputs being <= 4D.
|
||||
TF_LITE_ENSURE(
|
||||
context, op_context.dims <= reference_ops::PadKernelMaxDimensionCount());
|
||||
TF_LITE_ENSURE(context, IsConstantTensor(op_context.paddings));
|
||||
// Calculate OpData:
|
||||
data->params.resizing_category = ResizingCategory::kGenericResize;
|
||||
const int paddings_total = GetTensorShape(paddings).FlatSize();
|
||||
if (paddings_total == 8 && (paddings_data[0] == 0 && paddings_data[1] == 0) &&
|
||||
(paddings_data[6] == 0 && paddings_data[7] == 0)) {
|
||||
data->params.resizing_category = ResizingCategory::kImageStyle;
|
||||
}
|
||||
|
||||
const int num_input_dimensions = NumDimensions(input);
|
||||
data->params.left_padding_count = num_input_dimensions;
|
||||
data->params.right_padding_count = num_input_dimensions;
|
||||
|
||||
for (int idx = num_input_dimensions - 1; idx >= 0; --idx) {
|
||||
data->params.left_padding[idx] = paddings_data[idx * 2];
|
||||
data->params.right_padding[idx] = paddings_data[idx * 2 + 1];
|
||||
}
|
||||
|
||||
if (input->type == kTfLiteInt8 || input->type == kTfLiteUInt8) {
|
||||
if (constant_values == nullptr) {
|
||||
// Quantized Pad requires that 0 is represented in the quantized
|
||||
// range.
|
||||
if (input->type == kTfLiteUInt8) {
|
||||
TF_LITE_ENSURE(context, output->params.zero_point >=
|
||||
std::numeric_limits<uint8_t>::min());
|
||||
TF_LITE_ENSURE(context, output->params.zero_point <=
|
||||
std::numeric_limits<uint8_t>::max());
|
||||
} else {
|
||||
TF_LITE_ENSURE(context, output->params.zero_point >=
|
||||
std::numeric_limits<int8_t>::min());
|
||||
TF_LITE_ENSURE(context, output->params.zero_point <=
|
||||
std::numeric_limits<int8_t>::max());
|
||||
}
|
||||
} else {
|
||||
// Quantized Pad requires that 'constant_values' is represented in the
|
||||
// same quantized range as the input and output tensors.
|
||||
TF_LITE_ENSURE_EQ(context, output->params.zero_point,
|
||||
constant_values->params.zero_point);
|
||||
TF_LITE_ENSURE_EQ(context, static_cast<double>(output->params.scale),
|
||||
static_cast<double>(constant_values->params.scale));
|
||||
}
|
||||
data->output_zero_point = output->params.zero_point;
|
||||
}
|
||||
|
||||
return kTfLiteOk;
|
||||
}
|
||||
|
||||
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
|
||||
PadContext op_context(context, node);
|
||||
TFLITE_DCHECK(node->user_data != nullptr);
|
||||
const OpData* data = static_cast<const OpData*>(node->user_data);
|
||||
|
||||
if (op_context.constant_values != nullptr) {
|
||||
// Ensure that constant_values is a scalar.
|
||||
TF_LITE_ENSURE_EQ(context, NumElements(op_context.constant_values), 1);
|
||||
}
|
||||
const TfLiteEvalTensor* input =
|
||||
tflite::micro::GetEvalInput(context, node, /*index=*/0);
|
||||
const TfLiteEvalTensor* constant_values =
|
||||
NumInputs(node) == 3
|
||||
? tflite::micro::GetEvalInput(context, node, /*index=*/2)
|
||||
: nullptr;
|
||||
TfLiteEvalTensor* output =
|
||||
tflite::micro::GetEvalOutput(context, node, /*index=*/0);
|
||||
|
||||
// Create before and after padding arrays that are accepted by the kernel.
|
||||
const int32* paddings_data = GetTensorData<int32>(op_context.paddings);
|
||||
|
||||
tflite::PadParams op_params;
|
||||
memset(&op_params, 0, sizeof(PadParams));
|
||||
op_params.left_padding_count = op_context.dims;
|
||||
op_params.right_padding_count = op_context.dims;
|
||||
|
||||
for (int idx = op_context.dims - 1; idx >= 0; --idx) {
|
||||
op_params.left_padding[idx] = paddings_data[idx * 2];
|
||||
op_params.right_padding[idx] = paddings_data[idx * 2 + 1];
|
||||
}
|
||||
|
||||
#define TF_LITE_PAD(type, op_name, scalar, pad_value) \
|
||||
const scalar pad_value_copy = pad_value; \
|
||||
\
|
||||
type::op_name(op_params, GetTensorShape(op_context.input), \
|
||||
GetTensorData<scalar>(op_context.input), &pad_value_copy, \
|
||||
GetTensorShape(op_context.output), \
|
||||
GetTensorData<scalar>(op_context.output))
|
||||
switch (op_context.input->type) {
|
||||
switch (input->type) {
|
||||
case kTfLiteFloat32: {
|
||||
float pad_value = op_context.constant_values == nullptr
|
||||
? 0.f
|
||||
: *GetTensorData<float>(op_context.constant_values);
|
||||
if (op_context.resizing_category == ResizingCategory::kImageStyle) {
|
||||
TF_LITE_PAD(reference_ops, PadImageStyle, float, pad_value);
|
||||
float pad_value =
|
||||
constant_values == nullptr
|
||||
? 0.f
|
||||
: *tflite::micro::GetTensorData<float>(constant_values);
|
||||
if (data->params.resizing_category == ResizingCategory::kImageStyle) {
|
||||
reference_ops::PadImageStyle(
|
||||
data->params, tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<float>(input), &pad_value,
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<float>(output));
|
||||
} else {
|
||||
TF_LITE_PAD(reference_ops, Pad, float, pad_value);
|
||||
reference_ops::Pad(data->params, tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<float>(input),
|
||||
&pad_value, tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<float>(output));
|
||||
}
|
||||
} break;
|
||||
case kTfLiteUInt8: {
|
||||
uint8_t pad_value;
|
||||
if (op_context.constant_values == nullptr) {
|
||||
// Quantized Pad requires that 0 is represented in the quantized
|
||||
// range.
|
||||
TF_LITE_ENSURE(context, op_context.output->params.zero_point >=
|
||||
std::numeric_limits<uint8_t>::min());
|
||||
TF_LITE_ENSURE(context, op_context.output->params.zero_point <=
|
||||
std::numeric_limits<uint8_t>::max());
|
||||
pad_value = static_cast<uint8_t>(op_context.output->params.zero_point);
|
||||
if (constant_values == nullptr) {
|
||||
pad_value = static_cast<uint8_t>(data->output_zero_point);
|
||||
} else {
|
||||
// Quantized Pad requires that 'constant_values' is represented in the
|
||||
// same quantized range as the input and output tensors.
|
||||
TF_LITE_ENSURE_EQ(context, op_context.output->params.zero_point,
|
||||
op_context.constant_values->params.zero_point);
|
||||
TF_LITE_ENSURE_EQ(
|
||||
context, static_cast<double>(op_context.output->params.scale),
|
||||
static_cast<double>(op_context.constant_values->params.scale));
|
||||
pad_value = *GetTensorData<uint8_t>(op_context.constant_values);
|
||||
pad_value = *tflite::micro::GetTensorData<uint8_t>(constant_values);
|
||||
}
|
||||
if (op_context.resizing_category == ResizingCategory::kImageStyle) {
|
||||
TF_LITE_PAD(reference_ops, PadImageStyle, uint8_t, pad_value);
|
||||
if (data->params.resizing_category == ResizingCategory::kImageStyle) {
|
||||
reference_ops::PadImageStyle(
|
||||
data->params, tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<uint8_t>(input), &pad_value,
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<uint8_t>(output));
|
||||
} else {
|
||||
TF_LITE_PAD(reference_ops, Pad, uint8_t, pad_value);
|
||||
reference_ops::Pad(data->params, tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<uint8_t>(input),
|
||||
&pad_value, tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<uint8_t>(output));
|
||||
}
|
||||
} break;
|
||||
case kTfLiteInt8: {
|
||||
int8_t pad_value;
|
||||
if (op_context.constant_values == nullptr) {
|
||||
// Quantized Pad requires that 0 is represented in the quantized
|
||||
// range.
|
||||
TF_LITE_ENSURE(context, op_context.output->params.zero_point >=
|
||||
std::numeric_limits<int8_t>::min());
|
||||
TF_LITE_ENSURE(context, op_context.output->params.zero_point <=
|
||||
std::numeric_limits<int8_t>::max());
|
||||
pad_value = static_cast<int8_t>(op_context.output->params.zero_point);
|
||||
if (constant_values == nullptr) {
|
||||
pad_value = static_cast<uint8_t>(data->output_zero_point);
|
||||
} else {
|
||||
// Quantized Pad requires that 'constant_values' is represented in the
|
||||
// same quantized range as the input and output tensors.
|
||||
TF_LITE_ENSURE_EQ(context, op_context.output->params.zero_point,
|
||||
op_context.constant_values->params.zero_point);
|
||||
TF_LITE_ENSURE(context, op_context.output->params.scale ==
|
||||
op_context.constant_values->params.scale);
|
||||
pad_value = *GetTensorData<int8_t>(op_context.constant_values);
|
||||
pad_value = *tflite::micro::GetTensorData<int8_t>(constant_values);
|
||||
}
|
||||
if (op_context.resizing_category == ResizingCategory::kImageStyle) {
|
||||
TF_LITE_PAD(reference_ops, PadImageStyle, int8_t, pad_value);
|
||||
if (data->params.resizing_category == ResizingCategory::kImageStyle) {
|
||||
reference_ops::PadImageStyle(
|
||||
data->params, tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<int8_t>(input), &pad_value,
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<int8_t>(output));
|
||||
} else {
|
||||
TF_LITE_PAD(reference_ops, Pad, int8_t, pad_value);
|
||||
reference_ops::Pad(data->params, tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<int8_t>(input),
|
||||
&pad_value, tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<int8_t>(output));
|
||||
}
|
||||
} break;
|
||||
case kTfLiteInt32: {
|
||||
int32_t pad_value =
|
||||
op_context.constant_values == nullptr
|
||||
constant_values == nullptr
|
||||
? 0
|
||||
: *GetTensorData<int32_t>(op_context.constant_values);
|
||||
TF_LITE_PAD(reference_ops, Pad, int32_t, pad_value);
|
||||
: *tflite::micro::GetTensorData<int32_t>(constant_values);
|
||||
reference_ops::Pad(data->params, tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<int32_t>(input),
|
||||
&pad_value, tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<int32_t>(output));
|
||||
} break;
|
||||
default:
|
||||
|
||||
TF_LITE_KERNEL_LOG(context, "Type %s not currently supported by Pad.",
|
||||
TfLiteTypeGetName(op_context.input->type));
|
||||
TfLiteTypeGetName(input->type));
|
||||
return kTfLiteError;
|
||||
}
|
||||
#undef TF_LITE_PAD
|
||||
@ -207,29 +223,27 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
|
||||
|
||||
} // namespace pad
|
||||
|
||||
TfLiteRegistration* Register_PAD() {
|
||||
static TfLiteRegistration r = {/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/pad::Prepare,
|
||||
/*invoke=*/pad::Eval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
TfLiteRegistration Register_PAD() {
|
||||
return {/*init=*/pad::Init,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/pad::Prepare,
|
||||
/*invoke=*/pad::Eval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
// Also register Pad as PadV2.
|
||||
TfLiteRegistration* Register_PADV2() {
|
||||
static TfLiteRegistration r = {/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/pad::Prepare,
|
||||
/*invoke=*/pad::Eval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
TfLiteRegistration Register_PADV2() {
|
||||
return {/*init=*/pad::Init,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/pad::Prepare,
|
||||
/*invoke=*/pad::Eval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
} // namespace micro
|
||||
|
||||
@ -19,6 +19,7 @@ limitations under the License.
|
||||
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
|
||||
#include "tensorflow/lite/kernels/kernel_util.h"
|
||||
#include "tensorflow/lite/kernels/padding.h"
|
||||
#include "tensorflow/lite/micro/kernels/kernel_util.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace ops {
|
||||
@ -32,6 +33,10 @@ constexpr int kOutputTensor = 0;
|
||||
|
||||
struct OpData {
|
||||
TfLitePaddingValues padding;
|
||||
int32_t activation_min;
|
||||
int32_t activation_max;
|
||||
float activation_min_f32;
|
||||
float activation_max_f32;
|
||||
};
|
||||
|
||||
TfLiteStatus CalculateOpData(const TfLiteContext* context,
|
||||
@ -55,11 +60,7 @@ TfLiteStatus CalculateOpData(const TfLiteContext* context,
|
||||
|
||||
void AverageEvalFloat(const TfLiteContext* context, const TfLiteNode* node,
|
||||
const TfLitePoolParams* params, const OpData* data,
|
||||
const TfLiteTensor* input, TfLiteTensor* output) {
|
||||
float activation_min, activation_max;
|
||||
CalculateActivationRange(params->activation, &activation_min,
|
||||
&activation_max);
|
||||
|
||||
const TfLiteEvalTensor* input, TfLiteEvalTensor* output) {
|
||||
PoolParams op_params;
|
||||
op_params.stride_height = params->stride_height;
|
||||
op_params.stride_width = params->stride_width;
|
||||
@ -67,20 +68,19 @@ void AverageEvalFloat(const TfLiteContext* context, const TfLiteNode* node,
|
||||
op_params.filter_width = params->filter_width;
|
||||
op_params.padding_values.height = data->padding.height;
|
||||
op_params.padding_values.width = data->padding.width;
|
||||
op_params.float_activation_min = activation_min;
|
||||
op_params.float_activation_max = activation_max;
|
||||
reference_ops::AveragePool(
|
||||
op_params, GetTensorShape(input), GetTensorData<float>(input),
|
||||
GetTensorShape(output), GetTensorData<float>(output));
|
||||
op_params.float_activation_min = data->activation_min_f32;
|
||||
op_params.float_activation_max = data->activation_max_f32;
|
||||
reference_ops::AveragePool(op_params, tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<float>(input),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<float>(output));
|
||||
}
|
||||
|
||||
void AverageEvalQuantized(TfLiteContext* context, const TfLiteNode* node,
|
||||
const TfLitePoolParams* params, const OpData* data,
|
||||
const TfLiteTensor* input, TfLiteTensor* output) {
|
||||
const TfLiteEvalTensor* input,
|
||||
TfLiteEvalTensor* output) {
|
||||
TFLITE_DCHECK(input->type == kTfLiteUInt8 || input->type == kTfLiteInt8);
|
||||
int32_t activation_min, activation_max;
|
||||
(void)CalculateActivationRangeQuantized(context, params->activation, output,
|
||||
&activation_min, &activation_max);
|
||||
|
||||
PoolParams op_params;
|
||||
op_params.stride_height = params->stride_height;
|
||||
@ -89,27 +89,26 @@ void AverageEvalQuantized(TfLiteContext* context, const TfLiteNode* node,
|
||||
op_params.filter_width = params->filter_width;
|
||||
op_params.padding_values.height = data->padding.height;
|
||||
op_params.padding_values.width = data->padding.width;
|
||||
op_params.quantized_activation_min = activation_min;
|
||||
op_params.quantized_activation_max = activation_max;
|
||||
op_params.quantized_activation_min = data->activation_min;
|
||||
op_params.quantized_activation_max = data->activation_max;
|
||||
|
||||
if (input->type == kTfLiteUInt8) {
|
||||
reference_ops::AveragePool(
|
||||
op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
|
||||
GetTensorShape(output), GetTensorData<uint8_t>(output));
|
||||
reference_ops::AveragePool(op_params, tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<uint8_t>(input),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<uint8_t>(output));
|
||||
} else {
|
||||
reference_integer_ops::AveragePool(
|
||||
op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
|
||||
GetTensorShape(output), GetTensorData<int8_t>(output));
|
||||
op_params, tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<int8_t>(input),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<int8_t>(output));
|
||||
}
|
||||
}
|
||||
|
||||
void MaxEvalFloat(TfLiteContext* context, TfLiteNode* node,
|
||||
TfLitePoolParams* params, OpData* data,
|
||||
const TfLiteTensor* input, TfLiteTensor* output) {
|
||||
float activation_min, activation_max;
|
||||
CalculateActivationRange(params->activation, &activation_min,
|
||||
&activation_max);
|
||||
|
||||
TfLitePoolParams* params, const OpData* data,
|
||||
const TfLiteEvalTensor* input, TfLiteEvalTensor* output) {
|
||||
tflite::PoolParams op_params;
|
||||
op_params.stride_height = params->stride_height;
|
||||
op_params.stride_width = params->stride_width;
|
||||
@ -117,22 +116,17 @@ void MaxEvalFloat(TfLiteContext* context, TfLiteNode* node,
|
||||
op_params.filter_width = params->filter_width;
|
||||
op_params.padding_values.height = data->padding.height;
|
||||
op_params.padding_values.width = data->padding.width;
|
||||
op_params.float_activation_min = activation_min;
|
||||
op_params.float_activation_max = activation_max;
|
||||
reference_ops::MaxPool(op_params, GetTensorShape(input),
|
||||
GetTensorData<float>(input), GetTensorShape(output),
|
||||
GetTensorData<float>(output));
|
||||
op_params.float_activation_min = data->activation_min_f32;
|
||||
op_params.float_activation_max = data->activation_max_f32;
|
||||
reference_ops::MaxPool(op_params, tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<float>(input),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<float>(output));
|
||||
}
|
||||
|
||||
void MaxEvalQuantized(TfLiteContext* context, TfLiteNode* node,
|
||||
TfLitePoolParams* params, OpData* data,
|
||||
const TfLiteTensor* input, TfLiteTensor* output) {
|
||||
TFLITE_DCHECK(input->type == kTfLiteUInt8 || input->type == kTfLiteInt8);
|
||||
|
||||
int32_t activation_min, activation_max;
|
||||
(void)CalculateActivationRangeQuantized(context, params->activation, output,
|
||||
&activation_min, &activation_max);
|
||||
|
||||
TfLitePoolParams* params, const OpData* data,
|
||||
const TfLiteEvalTensor* input, TfLiteEvalTensor* output) {
|
||||
tflite::PoolParams op_params;
|
||||
op_params.stride_height = params->stride_height;
|
||||
op_params.stride_width = params->stride_width;
|
||||
@ -140,39 +134,44 @@ void MaxEvalQuantized(TfLiteContext* context, TfLiteNode* node,
|
||||
op_params.filter_width = params->filter_width;
|
||||
op_params.padding_values.height = data->padding.height;
|
||||
op_params.padding_values.width = data->padding.width;
|
||||
op_params.quantized_activation_min = activation_min;
|
||||
op_params.quantized_activation_max = activation_max;
|
||||
op_params.quantized_activation_min = data->activation_min;
|
||||
op_params.quantized_activation_max = data->activation_max;
|
||||
|
||||
if (input->type == kTfLiteUInt8) {
|
||||
reference_ops::MaxPool(
|
||||
op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
|
||||
GetTensorShape(output), GetTensorData<uint8_t>(output));
|
||||
reference_ops::MaxPool(op_params, tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<uint8_t>(input),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<uint8_t>(output));
|
||||
} else {
|
||||
reference_integer_ops::MaxPool(
|
||||
op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
|
||||
GetTensorShape(output), GetTensorData<int8_t>(output));
|
||||
op_params, tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<int8_t>(input),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<int8_t>(output));
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
|
||||
TfLiteStatus AverageEval(TfLiteContext* context, TfLiteNode* node) {
|
||||
TFLITE_DCHECK(node->builtin_data != nullptr);
|
||||
auto* params = reinterpret_cast<TfLitePoolParams*>(node->builtin_data);
|
||||
OpData data;
|
||||
|
||||
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
|
||||
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
|
||||
TFLITE_DCHECK(node->user_data != nullptr);
|
||||
const OpData* data = static_cast<const OpData*>(node->user_data);
|
||||
|
||||
TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, input, output, &data));
|
||||
const TfLiteEvalTensor* input =
|
||||
tflite::micro::GetEvalInput(context, node, kInputTensor);
|
||||
TfLiteEvalTensor* output =
|
||||
tflite::micro::GetEvalOutput(context, node, kOutputTensor);
|
||||
|
||||
// Inputs and outputs share the same type, guaranteed by the converter.
|
||||
switch (input->type) {
|
||||
case kTfLiteFloat32:
|
||||
AverageEvalFloat(context, node, params, &data, input, output);
|
||||
AverageEvalFloat(context, node, params, data, input, output);
|
||||
break;
|
||||
case kTfLiteUInt8:
|
||||
case kTfLiteInt8:
|
||||
AverageEvalQuantized(context, node, params, &data, input, output);
|
||||
AverageEvalQuantized(context, node, params, data, input, output);
|
||||
break;
|
||||
default:
|
||||
TF_LITE_KERNEL_LOG(context, "Input type %s is not currently supported",
|
||||
@ -183,21 +182,24 @@ TfLiteStatus AverageEval(TfLiteContext* context, TfLiteNode* node) {
|
||||
}
|
||||
|
||||
TfLiteStatus MaxEval(TfLiteContext* context, TfLiteNode* node) {
|
||||
TFLITE_DCHECK(node->builtin_data != nullptr);
|
||||
auto* params = reinterpret_cast<TfLitePoolParams*>(node->builtin_data);
|
||||
OpData data;
|
||||
|
||||
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
|
||||
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
|
||||
TFLITE_DCHECK(node->user_data != nullptr);
|
||||
const OpData* data = static_cast<const OpData*>(node->user_data);
|
||||
|
||||
TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, input, output, &data));
|
||||
const TfLiteEvalTensor* input =
|
||||
tflite::micro::GetEvalInput(context, node, kInputTensor);
|
||||
TfLiteEvalTensor* output =
|
||||
tflite::micro::GetEvalOutput(context, node, kOutputTensor);
|
||||
|
||||
switch (input->type) {
|
||||
case kTfLiteFloat32:
|
||||
MaxEvalFloat(context, node, params, &data, input, output);
|
||||
MaxEvalFloat(context, node, params, data, input, output);
|
||||
break;
|
||||
case kTfLiteUInt8:
|
||||
case kTfLiteInt8:
|
||||
MaxEvalQuantized(context, node, params, &data, input, output);
|
||||
MaxEvalQuantized(context, node, params, data, input, output);
|
||||
break;
|
||||
default:
|
||||
TF_LITE_KERNEL_LOG(context, "Type %s not currently supported.",
|
||||
@ -207,30 +209,57 @@ TfLiteStatus MaxEval(TfLiteContext* context, TfLiteNode* node) {
|
||||
return kTfLiteOk;
|
||||
}
|
||||
|
||||
} // namespace pooling
|
||||
|
||||
TfLiteRegistration* Register_AVERAGE_POOL_2D() {
|
||||
static TfLiteRegistration r = {/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/nullptr,
|
||||
/*invoke=*/pooling::AverageEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
|
||||
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
|
||||
return context->AllocatePersistentBuffer(context, sizeof(OpData));
|
||||
}
|
||||
|
||||
TfLiteRegistration* Register_MAX_POOL_2D() {
|
||||
static TfLiteRegistration r = {/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/nullptr,
|
||||
/*invoke=*/pooling::MaxEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
|
||||
TFLITE_DCHECK(node->builtin_data != nullptr);
|
||||
auto* params = reinterpret_cast<TfLitePoolParams*>(node->builtin_data);
|
||||
|
||||
TFLITE_DCHECK(node->user_data != nullptr);
|
||||
OpData* data = static_cast<OpData*>(node->user_data);
|
||||
|
||||
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
|
||||
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
|
||||
|
||||
TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, input, output, data));
|
||||
|
||||
if (input->type == kTfLiteFloat32) {
|
||||
CalculateActivationRange(params->activation, &data->activation_min_f32,
|
||||
&data->activation_max_f32);
|
||||
} else if (input->type == kTfLiteInt8 || input->type == kTfLiteUInt8) {
|
||||
CalculateActivationRangeQuantized(context, params->activation, output,
|
||||
&data->activation_min,
|
||||
&data->activation_max);
|
||||
}
|
||||
|
||||
return kTfLiteOk;
|
||||
}
|
||||
|
||||
} // namespace pooling
|
||||
|
||||
TfLiteRegistration Register_AVERAGE_POOL_2D() {
|
||||
return {/*init=*/pooling::Init,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/pooling::Prepare,
|
||||
/*invoke=*/pooling::AverageEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
TfLiteRegistration Register_MAX_POOL_2D() {
|
||||
return {/*init=*/pooling::Init,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/pooling::Prepare,
|
||||
/*invoke=*/pooling::MaxEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
} // namespace micro
|
||||
|
||||
@ -15,20 +15,45 @@ limitations under the License.

#include "tensorflow/lite/kernels/internal/reference/prelu.h"

#include <cstdint>

#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"

namespace tflite {
namespace ops {
namespace micro {
namespace activations {
namespace {

TfLiteStatus CalculatePreluParams(const TfLiteTensor* input,
const TfLiteTensor* alpha,
TfLiteTensor* output, PreluParams* params) {
if (output->type == kTfLiteInt8 || output->type == kTfLiteUInt8 ||
output->type == kTfLiteInt16) {
double real_multiplier_1 = static_cast<double>(input->params.scale) /
static_cast<double>(output->params.scale);
double real_multiplier_2 = static_cast<double>(input->params.scale) *
static_cast<double>(alpha->params.scale) /
static_cast<double>(output->params.scale);
QuantizeMultiplier(real_multiplier_1, &params->output_multiplier_1,
&params->output_shift_1);
QuantizeMultiplier(real_multiplier_2, &params->output_multiplier_2,
&params->output_shift_2);

params->input_offset = -input->params.zero_point;
params->alpha_offset = -alpha->params.zero_point;
params->output_offset = output->params.zero_point;
}

TfLiteStatus PreluPrepare(TfLiteContext* context, TfLiteNode* node) {
return kTfLiteOk;
}

} // namespace

inline void BroadcastPrelu4DSlowFloat(
|
||||
const RuntimeShape& unextended_input1_shape, const float* input1_data,
|
||||
const RuntimeShape& unextended_input2_shape, const float* input2_data,
|
||||
@ -60,51 +85,64 @@ inline void BroadcastPrelu4DSlowFloat(
|
||||
}
|
||||
}
|
||||
|
||||
TfLiteStatus PreluEval(TfLiteContext* context, TfLiteNode* node) {
|
||||
void* PreluInit(TfLiteContext* context, const char* buffer, size_t length) {
|
||||
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
|
||||
return context->AllocatePersistentBuffer(context, sizeof(PreluParams));
|
||||
}
|
||||
|
||||
TfLiteStatus PreluPrepare(TfLiteContext* context, TfLiteNode* node) {
|
||||
TFLITE_DCHECK(node->user_data != nullptr);
|
||||
PreluParams* params = static_cast<PreluParams*>(node->user_data);
|
||||
|
||||
const TfLiteTensor* input = GetInput(context, node, 0);
|
||||
const TfLiteTensor* alpha = GetInput(context, node, 1);
|
||||
TfLiteTensor* output = GetOutput(context, node, 0);
|
||||
int32_t output_multiplier_1 = 0;
|
||||
int output_shift_1 = 0;
|
||||
int32_t output_multiplier_2 = 0;
|
||||
int output_shift_2 = 0;
|
||||
if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt16) {
|
||||
double real_multiplier_1 = static_cast<double>(input->params.scale) *
|
||||
static_cast<double>(output->params.scale);
|
||||
double real_multiplier_2 = static_cast<double>(input->params.scale) *
|
||||
static_cast<double>(alpha->params.scale) /
|
||||
static_cast<double>(output->params.scale);
|
||||
QuantizeMultiplier(real_multiplier_1, &output_multiplier_1,
|
||||
&output_shift_1);
|
||||
QuantizeMultiplier(real_multiplier_2, &output_multiplier_2,
|
||||
&output_shift_2);
|
||||
}
|
||||
|
||||
return CalculatePreluParams(input, alpha, output, params);
|
||||
}
|
||||
|
||||
TfLiteStatus PreluEval(TfLiteContext* context, TfLiteNode* node) {
|
||||
TFLITE_DCHECK(node->user_data != nullptr);
|
||||
const PreluParams& params =
|
||||
*(static_cast<const PreluParams*>(node->user_data));
|
||||
|
||||
const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0);
|
||||
const TfLiteEvalTensor* alpha = tflite::micro::GetEvalInput(context, node, 1);
|
||||
TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0);
|
||||
|
||||
switch (input->type) {
|
||||
case kTfLiteFloat32: {
|
||||
BroadcastPrelu4DSlowFloat(
|
||||
GetTensorShape(input), GetTensorData<float>(input),
|
||||
GetTensorShape(alpha), GetTensorData<float>(alpha),
|
||||
GetTensorShape(output), GetTensorData<float>(output));
|
||||
BroadcastPrelu4DSlowFloat(tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<float>(input),
|
||||
tflite::micro::GetTensorShape(alpha),
|
||||
tflite::micro::GetTensorData<float>(alpha),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<float>(output));
|
||||
return kTfLiteOk;
|
||||
} break;
|
||||
case kTfLiteUInt8: {
|
||||
PreluParams op_params;
|
||||
op_params.input_offset = -input->params.zero_point;
|
||||
op_params.alpha_offset = -alpha->params.zero_point;
|
||||
op_params.output_offset = output->params.zero_point;
|
||||
op_params.output_multiplier_1 = output_multiplier_1;
|
||||
op_params.output_shift_1 = output_shift_1;
|
||||
op_params.output_multiplier_2 = output_multiplier_2;
|
||||
op_params.output_shift_2 = output_shift_2;
|
||||
reference_ops::BroadcastPrelu4DSlow(
|
||||
op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
|
||||
GetTensorShape(alpha), GetTensorData<uint8_t>(alpha),
|
||||
GetTensorShape(output), GetTensorData<uint8_t>(output));
|
||||
params, tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<uint8_t>(input),
|
||||
tflite::micro::GetTensorShape(alpha),
|
||||
tflite::micro::GetTensorData<uint8_t>(alpha),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<uint8_t>(output));
|
||||
return kTfLiteOk;
|
||||
} break;
|
||||
case kTfLiteInt8: {
|
||||
reference_ops::BroadcastPrelu4DSlow(
|
||||
params, tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<int8_t>(input),
|
||||
tflite::micro::GetTensorShape(alpha),
|
||||
tflite::micro::GetTensorData<int8_t>(alpha),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<int8_t>(output));
|
||||
return kTfLiteOk;
|
||||
} break;
|
||||
default:
|
||||
TF_LITE_KERNEL_LOG(
|
||||
context, "Only float32 and uint8 are supported currently, got %d.",
|
||||
context, "Only float32 and uint8_t are supported currently, got %d.",
|
||||
TfLiteTypeGetName(input->type));
|
||||
return kTfLiteError;
|
||||
}
|
||||
@ -112,16 +150,15 @@ TfLiteStatus PreluEval(TfLiteContext* context, TfLiteNode* node) {
|
||||
|
||||
} // namespace activations
|
||||
|
||||
TfLiteRegistration* Register_PRELU() {
|
||||
static TfLiteRegistration r = {/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/activations::PreluPrepare,
|
||||
/*invoke=*/activations::PreluEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
TfLiteRegistration Register_PRELU() {
|
||||
return {/*init=*/activations::PreluInit,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/activations::PreluPrepare,
|
||||
/*invoke=*/activations::PreluEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
} // namespace micro
|
||||
|
||||
@ -19,6 +19,7 @@ limitations under the License.
|
||||
#include "tensorflow/lite/kernels/internal/reference/requantize.h"
|
||||
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
|
||||
#include "tensorflow/lite/kernels/kernel_util.h"
|
||||
#include "tensorflow/lite/micro/kernels/kernel_util.h"
|
||||
#include "tensorflow/lite/micro/micro_utils.h"
|
||||
|
||||
namespace tflite {
|
||||
@ -27,20 +28,18 @@ namespace micro {
|
||||
namespace quantize {
|
||||
|
||||
struct OpData {
|
||||
tflite::QuantizationParams quantization_params;
|
||||
// The scaling factor from input to output (aka the 'real multiplier') can
|
||||
// be represented as a fixed point multiplier plus a left shift.
|
||||
int32_t output_multiplier;
|
||||
int output_shift;
|
||||
|
||||
int32_t input_zero_point;
|
||||
};
|
||||
|
||||
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
|
||||
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
|
||||
void* data = nullptr;
|
||||
if (context->AllocatePersistentBuffer(context, sizeof(OpData), &data) ==
|
||||
kTfLiteError) {
|
||||
return nullptr;
|
||||
}
|
||||
return data;
|
||||
return context->AllocatePersistentBuffer(context, sizeof(OpData));
|
||||
}
|
||||
|
||||
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
|
||||
@ -63,18 +62,27 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
|
||||
TF_LITE_ENSURE(context, affine_quantization->scale);
|
||||
TF_LITE_ENSURE(context, affine_quantization->scale->size == 1);
|
||||
|
||||
TF_LITE_ENSURE(context,
|
||||
input->type == kTfLiteFloat32 || input->type == kTfLiteInt16);
|
||||
TF_LITE_ENSURE(context,
|
||||
output->type == kTfLiteUInt8 || output->type == kTfLiteInt8);
|
||||
TF_LITE_ENSURE(context, input->type == kTfLiteFloat32 ||
|
||||
input->type == kTfLiteInt16 ||
|
||||
input->type == kTfLiteInt8);
|
||||
TF_LITE_ENSURE(context, output->type == kTfLiteUInt8 ||
|
||||
output->type == kTfLiteInt8 ||
|
||||
output->type == kTfLiteInt16);
|
||||
|
||||
if (input->type == kTfLiteInt16 && output->type == kTfLiteInt8) {
|
||||
if (((input->type == kTfLiteInt16 || input->type == kTfLiteInt8) &&
|
||||
output->type == kTfLiteInt8) ||
|
||||
(input->type == kTfLiteInt16 && output->type == kTfLiteInt16)) {
|
||||
double effective_scale =
|
||||
static_cast<double>(input->params.scale / output->params.scale);
|
||||
|
||||
QuantizeMultiplier(effective_scale, &data->output_multiplier,
|
||||
&data->output_shift);
|
||||
}
|
||||
|
||||
data->quantization_params.zero_point = output->params.zero_point;
|
||||
data->quantization_params.scale = static_cast<double>(output->params.scale);
|
||||
|
||||
data->input_zero_point = input->params.zero_point;
|
||||
return kTfLiteOk;
|
||||
}
|
||||
|
||||
@ -82,25 +90,32 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
|
||||
TFLITE_DCHECK(node->user_data != nullptr);
|
||||
OpData* data = static_cast<OpData*>(node->user_data);
|
||||
|
||||
const TfLiteTensor* input = GetInput(context, node, 0);
|
||||
TfLiteTensor* output = GetOutput(context, node, 0);
|
||||
|
||||
tflite::QuantizationParams op_params;
|
||||
op_params.zero_point = output->params.zero_point;
|
||||
op_params.scale = static_cast<double>(output->params.scale);
|
||||
const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0);
|
||||
TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0);
|
||||
|
||||
if (input->type == kTfLiteFloat32) {
|
||||
switch (output->type) {
|
||||
case kTfLiteInt8:
|
||||
reference_ops::AffineQuantize(
|
||||
op_params, GetTensorShape(input), GetTensorData<float>(input),
|
||||
GetTensorShape(output), GetTensorData<int8_t>(output));
|
||||
data->quantization_params, tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<float>(input),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<int8_t>(output));
|
||||
break;
|
||||
case kTfLiteUInt8:
|
||||
reference_ops::AffineQuantize(
|
||||
op_params, GetTensorShape(input), GetTensorData<float>(input),
|
||||
GetTensorShape(output), GetTensorData<uint8_t>(output));
|
||||
data->quantization_params, tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<float>(input),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<uint8_t>(output));
|
||||
break;
|
||||
case kTfLiteInt16:
|
||||
reference_ops::AffineQuantize(
|
||||
data->quantization_params, tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<float>(input),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<int16_t>(output));
|
||||
return kTfLiteOk;
|
||||
default:
|
||||
TF_LITE_KERNEL_LOG(context, "Input %s, output %s not supported.",
|
||||
TfLiteTypeGetName(input->type),
|
||||
@ -111,10 +126,36 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
|
||||
size_t size = ElementCount(*input->dims);
|
||||
switch (output->type) {
|
||||
case kTfLiteInt8:
|
||||
reference_ops::Requantize(tflite::micro::GetTensorData<int16_t>(input),
|
||||
size, data->output_multiplier,
|
||||
data->output_shift, data->input_zero_point,
|
||||
data->quantization_params.zero_point,
|
||||
tflite::micro::GetTensorData<int8_t>(output));
|
||||
break;
|
||||
case kTfLiteInt16:
|
||||
reference_ops::Requantize(
|
||||
GetTensorData<int16_t>(input), size, data->output_multiplier,
|
||||
data->output_shift, input->params.zero_point,
|
||||
output->params.zero_point, GetTensorData<int8_t>(output));
|
||||
tflite::micro::GetTensorData<int16_t>(input), size,
|
||||
data->output_multiplier, data->output_shift, data->input_zero_point,
|
||||
data->quantization_params.zero_point,
|
||||
tflite::micro::GetTensorData<int16_t>(output));
|
||||
return kTfLiteOk;
|
||||
default:
|
||||
TF_LITE_KERNEL_LOG(context, "Input %s, output %s not supported.",
|
||||
TfLiteTypeGetName(input->type),
|
||||
TfLiteTypeGetName(output->type));
|
||||
return kTfLiteError;
|
||||
}
|
||||
} else if (input->type == kTfLiteInt8) {
|
||||
// Int8 to Int8 requantization, required if the input and output tensors
|
||||
// have different scales and/or zero points.
|
||||
size_t size = ElementCount(*input->dims);
|
||||
switch (output->type) {
|
||||
case kTfLiteInt8:
|
||||
reference_ops::Requantize(tflite::micro::GetTensorData<int8_t>(input),
|
||||
size, data->output_multiplier,
|
||||
data->output_shift, data->input_zero_point,
|
||||
data->quantization_params.zero_point,
|
||||
tflite::micro::GetTensorData<int8_t>(output));
|
||||
break;
|
||||
default:
|
||||
TF_LITE_KERNEL_LOG(context, "Input %s, output %s not supported.",
|
||||
@ -136,17 +177,16 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
|
||||
|
||||
// This Op (QUANTIZE) quantizes the input and produces quantized output.
|
||||
// AffineQuantize takes scale and zero point and quantizes the float value to
|
||||
// quantized output, in int8 or uint8 format.
|
||||
TfLiteRegistration* Register_QUANTIZE() {
|
||||
static TfLiteRegistration r = {/*init=*/quantize::Init,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/quantize::Prepare,
|
||||
/*invoke=*/quantize::Eval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
// quantized output, in int8_t or uint8_t format.
|
||||
TfLiteRegistration Register_QUANTIZE() {
|
||||
return {/*init=*/quantize::Init,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/quantize::Prepare,
|
||||
/*invoke=*/quantize::Eval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
} // namespace micro
|
||||
|
||||
@ -21,6 +21,8 @@ limitations under the License.
|
||||
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
#include "tensorflow/lite/kernels/kernel_util.h"
|
||||
#include "tensorflow/lite/micro/kernels/kernel_util.h"
|
||||
#include "tensorflow/lite/micro/micro_utils.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace ops {
|
||||
@ -50,7 +52,7 @@ TfLiteStatus PrepareSimple(TfLiteContext* context, TfLiteNode* node) {
|
||||
|
||||
TfLiteStatus PrepareMeanOrSum(TfLiteContext* context, TfLiteNode* node) {
|
||||
TF_LITE_ENSURE_OK(context, PrepareSimple(context, node));
|
||||
// TODO(b/144955155): Support uint8(b/144955155) and int8(b/144955018)
|
||||
// TODO(b/144955155): Support uint8_t(b/144955155) and int8_t(b/144955018)
|
||||
return kTfLiteOk;
|
||||
}
|
||||
|
||||
@ -58,7 +60,7 @@ void ResolveAxis(const int* axis_data, int axis_count,
|
||||
tflite::MeanParams* op_params) {
|
||||
int i = 0;
|
||||
for (; i < axis_count; ++i) {
|
||||
op_params->axis[i] = static_cast<int16>(axis_data[i]);
|
||||
op_params->axis[i] = static_cast<int16_t>(axis_data[i]);
|
||||
}
|
||||
for (; i < 4; ++i) {
|
||||
op_params->axis[i] = 1;
|
||||
@ -67,24 +69,25 @@ void ResolveAxis(const int* axis_data, int axis_count,
|
||||
}
|
||||
|
||||
TfLiteStatus EvalMean(TfLiteContext* context, TfLiteNode* node) {
|
||||
const TfLiteTensor* input = GetInput(context, node, 0);
|
||||
const TfLiteTensor* axis = GetInput(context, node, 1);
|
||||
TfLiteTensor* output = GetOutput(context, node, 0);
|
||||
const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0);
|
const TfLiteEvalTensor* axis = tflite::micro::GetEvalInput(context, node, 1);
TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0);
TfLiteReducerParams* params =
reinterpret_cast<TfLiteReducerParams*>(node->builtin_data);

int num_axis = static_cast<int>(NumElements(axis));
int num_axis = static_cast<int>(ElementCount(*axis->dims));
int temp_index[kMaxNumberOfAxis];
int resolved_axis[kMaxNumberOfReducedAxis];

switch (input->type) {
case kTfLiteFloat32: {
tflite::MeanParams op_params;
ResolveAxis(GetTensorData<int>(axis), num_axis, &op_params);
ResolveAxis(tflite::micro::GetTensorData<int>(axis), num_axis,
&op_params);
// TODO(b/146571391): Support only 4D Input and 2D Axis for Mean until
// scratch tensor allocation has been implemented in (b/132070898)
bool is_valid_inputs =
(NumDimensions(input) == 4 && op_params.axis_count == 2 &&
(input->dims->size == 4 && op_params.axis_count == 2 &&
((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
(op_params.axis[0] == 2 && op_params.axis[1] == 1)));
TF_LITE_ENSURE_MSG(
@ -95,22 +98,24 @@ TfLiteStatus EvalMean(TfLiteContext* context, TfLiteNode* node) {
// reference method.
// Defer to specialized implementation for 4D Mean across axes 1 & 2.
if (params->keep_dims) {
reference_ops::Mean(op_params, GetTensorShape(input),
GetTensorData<float>(input), GetTensorShape(output),
GetTensorData<float>(output));
reference_ops::Mean(op_params, tflite::micro::GetTensorShape(input),
tflite::micro::GetTensorData<float>(input),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<float>(output));
} else {
TF_LITE_ENSURE(
context,
reference_ops::Mean(GetTensorData<float>(input), input->dims->data,
input->dims->size, GetTensorData<float>(output),
output->dims->data, output->dims->size,
GetTensorData<int>(axis), num_axis,
params->keep_dims, temp_index, resolved_axis,
GetTensorData<float>(output)));
reference_ops::Mean(
tflite::micro::GetTensorData<float>(input), input->dims->data,
input->dims->size, tflite::micro::GetTensorData<float>(output),
output->dims->data, output->dims->size,
tflite::micro::GetTensorData<int>(axis), num_axis,
params->keep_dims, temp_index, resolved_axis,
tflite::micro::GetTensorData<float>(output)));
}
} break;
default:
// TODO(b/144955155): Support uint8(b/144955155) and int8(b/144955018)
// TODO(b/144955155): Support uint8_t(b/144955155) and int8_t(b/144955018)
TF_LITE_ENSURE_MSG(context, false,
"Currently, only float32 input type "
"is supported.");
@ -119,16 +124,15 @@ TfLiteStatus EvalMean(TfLiteContext* context, TfLiteNode* node) {
}
} // namespace reduce

TfLiteRegistration* Register_MEAN() {
static TfLiteRegistration r = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/reduce::PrepareMeanOrSum,
/*invoke=*/reduce::EvalMean,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
TfLiteRegistration Register_MEAN() {
return {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/reduce::PrepareMeanOrSum,
/*invoke=*/reduce::EvalMean,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
}
} // namespace micro
} // namespace ops

@ -18,6 +18,9 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/op_macros.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"
#include "tensorflow/lite/micro/memory_helpers.h"
#include "tensorflow/lite/micro/micro_utils.h"

namespace tflite {
namespace ops {
@ -61,7 +64,7 @@ TfLiteStatus ReshapeOutput(TfLiteContext* context, TfLiteNode* node) {
num_output_elements *= output_shape->data[stretch_dim];
}

TF_LITE_ENSURE_EQ(context, input->type, output->type);
TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
TF_LITE_ENSURE_EQ(context, num_input_elements, num_output_elements);
return kTfLiteOk;
}
@ -74,13 +77,21 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
}

TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
const TfLiteEvalTensor* input =
tflite::micro::GetEvalInput(context, node, kInputTensor);
TfLiteEvalTensor* output =
tflite::micro::GetEvalOutput(context, node, kOutputTensor);

// TODO(b/162522304): storing input bytes in OpData increases some models
// significantly, possibly due to alignment issues.
size_t input_bytes;
TF_LITE_ENSURE_STATUS(TfLiteTypeSizeOf(input->type, &input_bytes));
input_bytes *= ElementCount(*input->dims);

// Do nothing for in-place reshape.
if (input->data.raw != output->data.raw) {
// Otherwise perform reshape with copy.
for (size_t i = 0; i < input->bytes; ++i) {
for (size_t i = 0; i < input_bytes; ++i) {
output->data.raw[i] = input->data.raw[i];
}
}
@ -89,16 +100,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {

} // namespace reshape

TfLiteRegistration* Register_RESHAPE() {
static TfLiteRegistration r = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/reshape::Prepare,
/*invoke=*/reshape::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
TfLiteRegistration Register_RESHAPE() {
return {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/reshape::Prepare,
/*invoke=*/reshape::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
}

} // namespace micro

@ -20,6 +20,7 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/op_macros.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"

namespace tflite {
namespace ops {
@ -49,8 +50,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
output->type = input->type;

if (!IsConstantTensor(size)) {
TF_LITE_KERNEL_LOG(context,
"Dynamic tensors are unsupported in tfmicro.");
TF_LITE_KERNEL_LOG(context, "Dynamic tensors are unsupported in tfmicro.");
return kTfLiteError;
}
#endif
@ -61,9 +61,12 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
auto* params =
reinterpret_cast<TfLiteResizeNearestNeighborParams*>(node->builtin_data);

const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* size = GetInput(context, node, kSizeTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
const TfLiteEvalTensor* input =
tflite::micro::GetEvalInput(context, node, kInputTensor);
const TfLiteEvalTensor* size =
tflite::micro::GetEvalInput(context, node, kSizeTensor);
TfLiteEvalTensor* output =
tflite::micro::GetEvalOutput(context, node, kOutputTensor);

tflite::ResizeNearestNeighborParams op_params;
op_params.align_corners = params->align_corners;
@ -71,22 +74,31 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {

if (output->type == kTfLiteFloat32) {
reference_ops::ResizeNearestNeighbor(
op_params, GetTensorShape(input), GetTensorData<int32>(input),
GetTensorShape(size), GetTensorData<int32>(size),
GetTensorShape(output), GetTensorData<int32>(output));
op_params, tflite::micro::GetTensorShape(input),
tflite::micro::GetTensorData<int32_t>(input),
tflite::micro::GetTensorShape(size),
tflite::micro::GetTensorData<int32_t>(size),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<int32_t>(output));
} else if (output->type == kTfLiteUInt8) {
reference_ops::ResizeNearestNeighbor(
op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
GetTensorShape(size), GetTensorData<int32>(size),
GetTensorShape(output), GetTensorData<uint8_t>(output));
op_params, tflite::micro::GetTensorShape(input),
tflite::micro::GetTensorData<uint8_t>(input),
tflite::micro::GetTensorShape(size),
tflite::micro::GetTensorData<int32_t>(size),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<uint8_t>(output));
} else if (output->type == kTfLiteInt8) {
reference_ops::ResizeNearestNeighbor(
op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
GetTensorShape(size), GetTensorData<int32>(size),
GetTensorShape(output), GetTensorData<int8_t>(output));
op_params, tflite::micro::GetTensorShape(input),
tflite::micro::GetTensorData<int8_t>(input),
tflite::micro::GetTensorShape(size),
tflite::micro::GetTensorData<int32_t>(size),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<int8_t>(output));
} else {
TF_LITE_KERNEL_LOG(context,
"Output type is %d, requires float, uint8 or int8.",
"Output type is %d, requires float, uint8_t or int8_t.",
output->type);
return kTfLiteError;
}
@ -95,16 +107,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
}
} // namespace resize_nearest_neighbor

TfLiteRegistration* Register_RESIZE_NEAREST_NEIGHBOR() {
static TfLiteRegistration r = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/resize_nearest_neighbor::Prepare,
/*invoke=*/resize_nearest_neighbor::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
TfLiteRegistration Register_RESIZE_NEAREST_NEIGHBOR() {
return {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/resize_nearest_neighbor::Prepare,
/*invoke=*/resize_nearest_neighbor::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
}

} // namespace micro

@ -18,6 +18,7 @@ limitations under the License.
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"

namespace tflite {
namespace ops {
@ -32,8 +33,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
TF_LITE_ENSURE_EQ(context, output->type, input->type);
TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32);
TF_LITE_ENSURE_TYPES_EQ(context, output->type, input->type);
TF_LITE_ENSURE_EQ(context, output->bytes, input->bytes);
TF_LITE_ENSURE_EQ(context, output->dims->size, input->dims->size);
for (int i = 0; i < output->dims->size; ++i) {
@ -43,26 +44,29 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
}

TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
const TfLiteEvalTensor* input =
tflite::micro::GetEvalInput(context, node, kInputTensor);
TfLiteEvalTensor* output =
tflite::micro::GetEvalOutput(context, node, kOutputTensor);

reference_ops::Round(GetTensorShape(input), GetTensorData<float>(input),
GetTensorShape(output), GetTensorData<float>(output));
reference_ops::Round(tflite::micro::GetTensorShape(input),
tflite::micro::GetTensorData<float>(input),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<float>(output));

return kTfLiteOk;
}
} // namespace round

TfLiteRegistration* Register_ROUND() {
static TfLiteRegistration r = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/round::Prepare,
/*invoke=*/round::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
TfLiteRegistration Register_ROUND() {
return {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/round::Prepare,
/*invoke=*/round::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
}

} // namespace micro

@ -22,6 +22,7 @@ limitations under the License.
|
||||
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
|
||||
#include "tensorflow/lite/kernels/kernel_util.h"
|
||||
#include "tensorflow/lite/kernels/op_macros.h"
|
||||
#include "tensorflow/lite/micro/kernels/kernel_util.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace ops {
|
||||
@ -42,7 +43,8 @@ TfLiteStatus CalculateSoftmaxParams(TfLiteContext* context,
|
||||
TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteInt8);
|
||||
if (output->type == kTfLiteInt16) {
|
||||
TF_LITE_ENSURE_EQ(context, output->params.zero_point, -32768);
|
||||
// NOTE: Current int16 softmax output does not require symmetric scaling
|
||||
// NOTE: Current int16_t softmax output does not require symmetric
|
||||
// scaling
|
||||
// - so no need to verify scale here.
|
||||
} else {
|
||||
TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteInt8);
|
||||
@ -72,60 +74,75 @@ TfLiteStatus CalculateSoftmaxParams(TfLiteContext* context,
|
||||
|
||||
} // namespace
|
||||
|
||||
// Takes a tensor and performs softmax along the last dimension.
|
||||
void SoftmaxFloat(const TfLiteEvalTensor* input, TfLiteEvalTensor* output,
|
||||
const SoftmaxParams& op_data) {
|
||||
tflite::reference_ops::Softmax(op_data, tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<float>(input),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<float>(output));
|
||||
}
|
||||
|
||||
void SoftmaxQuantized(const TfLiteEvalTensor* input, TfLiteEvalTensor* output,
|
||||
const SoftmaxParams& op_data) {
|
||||
if (input->type == kTfLiteUInt8) {
|
||||
tflite::reference_ops::Softmax(
|
||||
op_data, tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<uint8_t>(input),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<uint8_t>(output));
|
||||
} else {
|
||||
if (output->type == kTfLiteInt16) {
|
||||
tflite::reference_ops::Softmax(
|
||||
op_data, tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<int8_t>(input),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<int16_t>(output));
|
||||
} else {
|
||||
tflite::reference_ops::Softmax(
|
||||
op_data, tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<int8_t>(input),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<int8_t>(output));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void* SoftmaxInit(TfLiteContext* context, const char* buffer, size_t length) {
|
||||
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
|
||||
return context->AllocatePersistentBuffer(context, sizeof(SoftmaxParams));
|
||||
}
|
||||
|
||||
TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) {
|
||||
auto* params = static_cast<TfLiteSoftmaxParams*>(node->builtin_data);
|
||||
|
||||
TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
|
||||
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
|
||||
const TfLiteTensor* input = GetInput(context, node, 0);
|
||||
TF_LITE_ENSURE(context, NumDimensions(input) >= 1);
|
||||
|
||||
return kTfLiteOk;
|
||||
}
|
||||
TfLiteTensor* output = GetOutput(context, node, 0);
|
||||
|
||||
// Takes a tensor and performs softmax along the last dimension.
|
||||
void SoftmaxFloat(const TfLiteTensor* input, TfLiteTensor* output,
|
||||
const SoftmaxParams& op_data) {
|
||||
tflite::reference_ops::Softmax(
|
||||
op_data, GetTensorShape(input), GetTensorData<float>(input),
|
||||
GetTensorShape(output), GetTensorData<float>(output));
|
||||
}
|
||||
|
||||
void SoftmaxQuantized(const TfLiteTensor* input, TfLiteTensor* output,
|
||||
const SoftmaxParams& op_data) {
|
||||
if (input->type == kTfLiteUInt8) {
|
||||
tflite::reference_ops::Softmax(
|
||||
op_data, GetTensorShape(input), GetTensorData<uint8_t>(input),
|
||||
GetTensorShape(output), GetTensorData<uint8_t>(output));
|
||||
} else {
|
||||
if (output->type == kTfLiteInt16) {
|
||||
tflite::reference_ops::Softmax(
|
||||
op_data, GetTensorShape(input), GetTensorData<int8_t>(input),
|
||||
GetTensorShape(output), GetTensorData<int16_t>(output));
|
||||
} else {
|
||||
tflite::reference_ops::Softmax(
|
||||
op_data, GetTensorShape(input), GetTensorData<int8_t>(input),
|
||||
GetTensorShape(output), GetTensorData<int8_t>(output));
|
||||
}
|
||||
}
|
||||
TFLITE_DCHECK(node->user_data != nullptr);
|
||||
SoftmaxParams* data = static_cast<SoftmaxParams*>(node->user_data);
|
||||
return CalculateSoftmaxParams(context, input, output, params, data);
|
||||
}
|
||||
|
||||
TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
|
||||
auto* params = static_cast<TfLiteSoftmaxParams*>(node->builtin_data);
|
||||
const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0);
|
||||
TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0);
|
||||
|
||||
const TfLiteTensor* input = GetInput(context, node, 0);
|
||||
TfLiteTensor* output = GetOutput(context, node, 0);
|
||||
|
||||
SoftmaxParams op_data;
|
||||
TF_LITE_ENSURE_STATUS(
|
||||
CalculateSoftmaxParams(context, input, output, params, &op_data));
|
||||
TFLITE_DCHECK(node->user_data != nullptr);
|
||||
SoftmaxParams* data = static_cast<SoftmaxParams*>(node->user_data);
|
||||
|
||||
switch (input->type) {
|
||||
case kTfLiteFloat32: {
|
||||
SoftmaxFloat(input, output, op_data);
|
||||
SoftmaxFloat(input, output, *data);
|
||||
return kTfLiteOk;
|
||||
}
|
||||
case kTfLiteInt8:
|
||||
case kTfLiteUInt8: {
|
||||
SoftmaxQuantized(input, output, op_data);
|
||||
SoftmaxQuantized(input, output, *data);
|
||||
return kTfLiteOk;
|
||||
}
|
||||
default:
|
||||
@ -136,16 +153,15 @@ TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
|
||||
}
|
||||
} // namespace activations
|
||||
|
||||
TfLiteRegistration* Register_SOFTMAX() {
|
||||
static TfLiteRegistration r = {/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/activations::SoftmaxPrepare,
|
||||
/*invoke=*/activations::SoftmaxEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
TfLiteRegistration Register_SOFTMAX() {
|
||||
return {/*init=*/activations::SoftmaxInit,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/activations::SoftmaxPrepare,
|
||||
/*invoke=*/activations::SoftmaxEval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
} // namespace micro
|
||||
|
||||
@ -17,6 +17,7 @@ limitations under the License.
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"

namespace tflite {
namespace ops {
@ -25,10 +26,11 @@ namespace split {

template <typename T>
TfLiteStatus SplitImpl(TfLiteContext* context, TfLiteNode* node,
const TfLiteTensor* input, int axis_value) {
const TfLiteEvalTensor* input, int axis_value) {
const int output_count = NumOutputs(node);
const TfLiteIntArray* input_dims = input->dims;
const TfLiteTensor* output0 = GetOutput(context, node, 0);
const TfLiteEvalTensor* output0 =
tflite::micro::GetEvalOutput(context, node, 0);
const TfLiteIntArray* output_dims = output0->dims;

const int split_dimensions = input_dims->size;
@ -50,11 +52,11 @@ TfLiteStatus SplitImpl(TfLiteContext* context, TfLiteNode* node,
base_inner_size *= input_dims->data[i];
}

const T* input_ptr = GetTensorData<T>(input);
const T* input_ptr = tflite::micro::GetTensorData<T>(input);
for (int k = 0; k < outer_size; ++k) {
for (int i = 0; i < output_count; ++i) {
TfLiteTensor* t = GetOutput(context, node, i);
T* output_data = GetTensorData<T>(t);
TfLiteEvalTensor* t = tflite::micro::GetEvalOutput(context, node, i);
T* output_data = tflite::micro::GetTensorData<T>(t);
const int copy_size = output_dims->data[axis] * base_inner_size;
T* output_ptr = output_data + k * copy_size;
for (int j = 0; j < copy_size; ++j) output_ptr[j] = input_ptr[j];
@ -65,23 +67,28 @@ TfLiteStatus SplitImpl(TfLiteContext* context, TfLiteNode* node,
return kTfLiteOk;
}

TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* axis = GetInput(context, node, 0);
const TfLiteTensor* input = GetInput(context, node, 1);

// Dynamic output tensors are needed if axis tensor is not constant.
// But Micro doesn't support dynamic memory allocation, so we only support
// constant axis tensor for now.
TF_LITE_ENSURE_MSG(context, IsConstantTensor(axis),
"Non constant axis tensor not supported");
return kTfLiteOk;
}

int axis_value = GetTensorData<int32_t>(axis)[0];
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
const TfLiteEvalTensor* axis = tflite::micro::GetEvalInput(context, node, 0);
const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 1);

int axis_value = tflite::micro::GetTensorData<int32_t>(axis)[0];
if (axis_value < 0) {
axis_value += NumDimensions(input);
axis_value += input->dims->size;
}

TF_LITE_ENSURE(context, axis_value >= 0);
TF_LITE_ENSURE(context, axis_value < NumDimensions(input));
TF_LITE_ENSURE(context, axis_value < input->dims->size);

switch (input->type) {
case kTfLiteFloat32: {
@ -111,16 +118,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {

} // namespace split

TfLiteRegistration* Register_SPLIT() {
static TfLiteRegistration r = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/split::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
TfLiteRegistration Register_SPLIT() {
return {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/split::Prepare,
/*invoke=*/split::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
}

} // namespace micro

@ -15,23 +15,20 @@ limitations under the License.
|
||||
#include "tensorflow/lite/kernels/internal/reference/strided_slice.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
|
||||
#include "tensorflow/lite/c/builtin_op_data.h"
|
||||
#include "tensorflow/lite/c/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
|
||||
#include "tensorflow/lite/kernels/kernel_util.h"
|
||||
#include "tensorflow/lite/kernels/op_macros.h"
|
||||
#include "tensorflow/lite/micro/kernels/kernel_util.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace ops {
|
||||
namespace micro {
|
||||
namespace strided_slice {
|
||||
|
||||
enum KernelType {
|
||||
kReference,
|
||||
// TODO(soroosh): add kGenericOptimized
|
||||
};
|
||||
|
||||
constexpr int kInputTensor = 0;
|
||||
constexpr int kBeginTensor = 1;
|
||||
constexpr int kEndTensor = 2;
|
||||
@ -120,64 +117,74 @@ TfLiteStatus CheckOutputSize(TfLiteContext* context,
|
||||
return kTfLiteOk;
|
||||
}
|
||||
|
||||
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
|
||||
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
|
||||
return context->AllocatePersistentBuffer(context, sizeof(StridedSliceParams));
|
||||
}
|
||||
|
||||
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
|
||||
TFLITE_DCHECK(node->user_data != nullptr);
|
||||
StridedSliceParams* op_params =
|
||||
static_cast<StridedSliceParams*>(node->user_data);
|
||||
TF_LITE_ENSURE_EQ(context, NumInputs(node), 4);
|
||||
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
|
||||
StridedSliceContext op_context(context, node);
|
||||
TF_LITE_ENSURE_MSG(context, op_context.dims <= kMaxDim,
|
||||
"input dim should not exceed 4");
|
||||
auto params = BuildStridedSliceParams(&op_context);
|
||||
memcpy(op_params, ¶ms, sizeof(StridedSliceParams));
|
||||
return CheckOutputSize(context, &op_context);
|
||||
}
|
||||
|
||||
template <KernelType kernel_type>
|
||||
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
|
||||
StridedSliceContext op_context(context, node);
|
||||
auto op_params = BuildStridedSliceParams(&op_context);
|
||||
TFLITE_DCHECK(node->user_data != nullptr);
|
||||
const StridedSliceParams& op_params =
|
||||
*(static_cast<const StridedSliceParams*>(node->user_data));
|
||||
|
||||
#define TF_LITE_STRIDED_SLICE(kernel_type, data_type) \
|
||||
kernel_type::StridedSlice(op_params, GetTensorShape(op_context.input), \
|
||||
GetTensorData<data_type>(op_context.input), \
|
||||
GetTensorShape(op_context.output), \
|
||||
GetTensorData<data_type>(op_context.output))
|
||||
|
||||
switch (op_context.input->type) {
|
||||
const TfLiteEvalTensor* input =
|
||||
tflite::micro::GetEvalInput(context, node, kInputTensor);
|
||||
TfLiteEvalTensor* output =
|
||||
tflite::micro::GetEvalOutput(context, node, kOutputTensor);
|
||||
switch (output->type) {
|
||||
case kTfLiteFloat32:
|
||||
if (kernel_type == kReference) {
|
||||
TF_LITE_STRIDED_SLICE(reference_ops, float);
|
||||
}
|
||||
reference_ops::StridedSlice(op_params,
|
||||
tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<float>(input),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<float>(output));
|
||||
break;
|
||||
case kTfLiteUInt8:
|
||||
if (kernel_type == kReference) {
|
||||
TF_LITE_STRIDED_SLICE(reference_ops, uint8_t);
|
||||
}
|
||||
reference_ops::StridedSlice(
|
||||
op_params, tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<uint8_t>(input),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<uint8_t>(output));
|
||||
break;
|
||||
case kTfLiteInt8:
|
||||
if (kernel_type == kReference) {
|
||||
TF_LITE_STRIDED_SLICE(reference_ops, int8_t);
|
||||
}
|
||||
reference_ops::StridedSlice(op_params,
|
||||
tflite::micro::GetTensorShape(input),
|
||||
tflite::micro::GetTensorData<int8_t>(input),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<int8_t>(output));
|
||||
break;
|
||||
default:
|
||||
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
|
||||
TfLiteTypeGetName(op_context.input->type),
|
||||
op_context.input->type);
|
||||
TfLiteTypeGetName(input->type), input->type);
|
||||
return kTfLiteError;
|
||||
}
|
||||
#undef TF_LITE_STRIDED_SLICE
|
||||
return kTfLiteOk;
|
||||
}
|
||||
} // namespace strided_slice
|
||||
|
||||
TfLiteRegistration* Register_STRIDED_SLICE() {
|
||||
static TfLiteRegistration r = {
|
||||
/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/strided_slice::Prepare,
|
||||
/*invoke=*/strided_slice::Eval<strided_slice::kReference>,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
TfLiteRegistration Register_STRIDED_SLICE() {
|
||||
return {/*init=*/strided_slice::Init,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/strided_slice::Prepare,
|
||||
/*invoke=*/strided_slice::Eval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
} // namespace micro
|
||||
|
||||
@ -21,8 +21,10 @@ limitations under the License.
|
||||
#include "tensorflow/lite/kernels/internal/quantization_util.h"
|
||||
#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h"
|
||||
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
#include "tensorflow/lite/kernels/kernel_util.h"
|
||||
#include "tensorflow/lite/kernels/op_macros.h"
|
||||
#include "tensorflow/lite/micro/kernels/kernel_util.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace ops {
|
||||
@ -40,18 +42,18 @@ struct OpData {
|
||||
// and the special 16-bit -> 16bit quantized path
|
||||
int input1_shift;
|
||||
int input2_shift;
|
||||
int32 output_activation_min;
|
||||
int32 output_activation_max;
|
||||
int32_t output_activation_min;
|
||||
int32_t output_activation_max;
|
||||
|
||||
// These fields are used only in the general 8-bit -> 8bit quantized path
|
||||
int32 input1_multiplier;
|
||||
int32 input2_multiplier;
|
||||
int32 output_multiplier;
|
||||
int32_t input1_multiplier;
|
||||
int32_t input2_multiplier;
|
||||
int32_t output_multiplier;
|
||||
int output_shift;
|
||||
int left_shift;
|
||||
int32 input1_offset;
|
||||
int32 input2_offset;
|
||||
int32 output_offset;
|
||||
int32_t input1_offset;
|
||||
int32_t input2_offset;
|
||||
int32_t output_offset;
|
||||
};
|
||||
|
||||
TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteSubParams* params,
|
||||
@ -93,31 +95,59 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteSubParams* params,
|
||||
return kTfLiteOk;
|
||||
}
|
||||
|
||||
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
|
||||
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
|
||||
return context->AllocatePersistentBuffer(context, sizeof(OpData));
|
||||
}
|
||||
|
||||
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
|
||||
TFLITE_DCHECK(node->user_data != nullptr);
|
||||
TFLITE_DCHECK(node->builtin_data != nullptr);
|
||||
|
||||
OpData* data = static_cast<OpData*>(node->user_data);
|
||||
auto* params = reinterpret_cast<TfLiteSubParams*>(node->builtin_data);
|
||||
|
||||
const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
|
||||
const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
|
||||
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
|
||||
|
||||
TF_LITE_ENSURE_STATUS(
|
||||
CalculateOpData(context, params, input1, input2, output, data));
|
||||
return kTfLiteOk;
|
||||
}
|
||||
|
||||
void EvalSub(TfLiteContext* context, TfLiteNode* node, TfLiteSubParams* params,
|
||||
const OpData* data, const TfLiteTensor* input1,
|
||||
const TfLiteTensor* input2, TfLiteTensor* output) {
|
||||
const OpData* data, const TfLiteEvalTensor* input1,
|
||||
const TfLiteEvalTensor* input2, TfLiteEvalTensor* output) {
|
||||
float output_activation_min, output_activation_max;
|
||||
CalculateActivationRange(params->activation, &output_activation_min,
|
||||
&output_activation_max);
|
||||
tflite::ArithmeticParams op_params;
|
||||
SetActivationParams(output_activation_min, output_activation_max, &op_params);
|
||||
#define TF_LITE_SUB(opname) \
|
||||
opname(op_params, GetTensorShape(input1), GetTensorData<float>(input1), \
|
||||
GetTensorShape(input2), GetTensorData<float>(input2), \
|
||||
GetTensorShape(output), GetTensorData<float>(output))
|
||||
if (data->requires_broadcast) {
|
||||
TF_LITE_SUB(tflite::reference_ops::BroadcastSubSlow);
|
||||
tflite::reference_ops::BroadcastSubSlow(
|
||||
op_params, tflite::micro::GetTensorShape(input1),
|
||||
tflite::micro::GetTensorData<float>(input1),
|
||||
tflite::micro::GetTensorShape(input2),
|
||||
tflite::micro::GetTensorData<float>(input2),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<float>(output));
|
||||
} else {
|
||||
TF_LITE_SUB(tflite::reference_ops::SubWithActivation);
|
||||
tflite::reference_ops::SubWithActivation(
|
||||
op_params, tflite::micro::GetTensorShape(input1),
|
||||
tflite::micro::GetTensorData<float>(input1),
|
||||
tflite::micro::GetTensorShape(input2),
|
||||
tflite::micro::GetTensorData<float>(input2),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<float>(output));
|
||||
}
|
||||
#undef TF_LITE_SUB
|
||||
}
|
||||
|
||||
TfLiteStatus EvalSubQuantized(TfLiteContext* context, TfLiteNode* node,
|
||||
TfLiteSubParams* params, const OpData* data,
|
||||
const TfLiteTensor* input1,
|
||||
const TfLiteTensor* input2,
|
||||
TfLiteTensor* output) {
|
||||
const TfLiteEvalTensor* input1,
|
||||
const TfLiteEvalTensor* input2,
|
||||
TfLiteEvalTensor* output) {
|
||||
if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
|
||||
tflite::ArithmeticParams op_params;
|
||||
op_params.left_shift = data->left_shift;
|
||||
@ -133,25 +163,46 @@ TfLiteStatus EvalSubQuantized(TfLiteContext* context, TfLiteNode* node,
|
||||
SetActivationParams(data->output_activation_min,
|
||||
data->output_activation_max, &op_params);
|
||||
bool need_broadcast = reference_ops::ProcessBroadcastShapes(
|
||||
GetTensorShape(input1), GetTensorShape(input2), &op_params);
|
||||
#define TF_LITE_SUB(opname, dtype) \
|
||||
opname(op_params, GetTensorShape(input1), GetTensorData<dtype>(input1), \
|
||||
GetTensorShape(input2), GetTensorData<dtype>(input2), \
|
||||
GetTensorShape(output), GetTensorData<dtype>(output));
|
||||
tflite::micro::GetTensorShape(input1),
|
||||
tflite::micro::GetTensorShape(input2), &op_params);
|
||||
|
||||
if (output->type == kTfLiteInt8) {
|
||||
if (need_broadcast) {
|
||||
TF_LITE_SUB(tflite::reference_ops::BroadcastSubSlow, int8_t);
|
||||
tflite::reference_ops::BroadcastSubSlow(
|
||||
op_params, tflite::micro::GetTensorShape(input1),
|
||||
tflite::micro::GetTensorData<int8_t>(input1),
|
||||
tflite::micro::GetTensorShape(input2),
|
||||
tflite::micro::GetTensorData<int8_t>(input2),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<int8_t>(output));
|
||||
} else {
|
||||
TF_LITE_SUB(tflite::reference_ops::Sub, int8_t);
|
||||
tflite::reference_ops::Sub(
|
||||
op_params, tflite::micro::GetTensorShape(input1),
|
||||
tflite::micro::GetTensorData<int8_t>(input1),
|
||||
tflite::micro::GetTensorShape(input2),
|
||||
tflite::micro::GetTensorData<int8_t>(input2),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<int8_t>(output));
|
||||
}
|
||||
} else {
|
||||
if (need_broadcast) {
|
||||
TF_LITE_SUB(tflite::reference_ops::BroadcastSubSlow, uint8_t);
|
||||
tflite::reference_ops::BroadcastSubSlow(
|
||||
op_params, tflite::micro::GetTensorShape(input1),
|
||||
tflite::micro::GetTensorData<uint8_t>(input1),
|
||||
tflite::micro::GetTensorShape(input2),
|
||||
tflite::micro::GetTensorData<uint8_t>(input2),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<uint8_t>(output));
|
||||
} else {
|
||||
TF_LITE_SUB(tflite::reference_ops::Sub, uint8_t);
|
||||
tflite::reference_ops::Sub(
|
||||
op_params, tflite::micro::GetTensorShape(input1),
|
||||
tflite::micro::GetTensorData<uint8_t>(input1),
|
||||
tflite::micro::GetTensorShape(input2),
|
||||
tflite::micro::GetTensorData<uint8_t>(input2),
|
||||
tflite::micro::GetTensorShape(output),
|
||||
tflite::micro::GetTensorData<uint8_t>(output));
|
||||
}
|
||||
}
|
||||
#undef TF_LITE_SUB
|
||||
}
|
||||
|
||||
return kTfLiteOk;
|
||||
@ -160,13 +211,15 @@ TfLiteStatus EvalSubQuantized(TfLiteContext* context, TfLiteNode* node,
|
||||
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
|
||||
auto* params = reinterpret_cast<TfLiteSubParams*>(node->builtin_data);
|
||||
|
||||
const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
|
||||
const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
|
||||
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
|
||||
const TfLiteEvalTensor* input1 =
|
||||
tflite::micro::GetEvalInput(context, node, kInputTensor1);
|
||||
const TfLiteEvalTensor* input2 =
|
||||
tflite::micro::GetEvalInput(context, node, kInputTensor2);
|
||||
TfLiteEvalTensor* output =
|
||||
tflite::micro::GetEvalOutput(context, node, kOutputTensor);
|
||||
|
||||
OpData data;
|
||||
TF_LITE_ENSURE_STATUS(
|
||||
CalculateOpData(context, params, input1, input2, output, &data));
|
||||
TFLITE_DCHECK(node->user_data != nullptr);
|
||||
const OpData& data = *(static_cast<const OpData*>(node->user_data));
|
||||
|
||||
if (output->type == kTfLiteFloat32) {
|
||||
EvalSub(context, node, params, &data, input1, input2, output);
|
||||
@ -184,16 +237,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
|
||||
|
||||
} // namespace sub
|
||||
|
||||
TfLiteRegistration* Register_SUB() {
|
||||
static TfLiteRegistration r = {/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/nullptr,
|
||||
/*invoke=*/sub::Eval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
TfLiteRegistration Register_SUB() {
|
||||
return {/*init=*/sub::Init,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/sub::Prepare,
|
||||
/*invoke=*/sub::Eval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
} // namespace micro
|
||||
|
||||
@ -23,6 +23,7 @@ limitations under the License.
|
||||
#include "tensorflow/lite/kernels/kernel_util.h"
|
||||
#include "tensorflow/lite/kernels/op_macros.h"
|
||||
#include "tensorflow/lite/micro/kernels/activation_utils.h"
|
||||
#include "tensorflow/lite/micro/kernels/kernel_util.h"
|
||||
#include "tensorflow/lite/micro/micro_utils.h"
|
||||
|
||||
namespace tflite {
|
||||
@ -32,14 +33,18 @@ namespace svdf {
|
||||
namespace {
|
||||
|
||||
struct OpData {
|
||||
int32 effective_scale_1_a;
|
||||
int32 effective_scale_2_a;
|
||||
int32_t effective_scale_1_a;
|
||||
int32_t effective_scale_2_a;
|
||||
// b versions of each scale are kept at int since the numbers are just the
|
||||
// shift value - typically between [-32, 32].
|
||||
int effective_scale_1_b;
|
||||
int effective_scale_2_b;
|
||||
int scratch_tensor_index;
|
||||
int scratch_output_tensor_index;
|
||||
|
||||
// Cached tensor zero point values for quantized operations.
|
||||
int input_zero_point;
|
||||
int output_zero_point;
|
||||
};
|
||||
|
||||
/**
|
||||
@ -114,11 +119,11 @@ static inline void ApplyTimeWeightsBiasAndActivation(
|
||||
}
|
||||
|
||||
inline void EvalFloatSVDF(
|
||||
TfLiteContext* context, TfLiteNode* node, const TfLiteTensor* input,
|
||||
const TfLiteTensor* weights_feature, const TfLiteTensor* weights_time,
|
||||
const TfLiteTensor* bias, const TfLiteSVDFParams* params,
|
||||
int scratch_tensor_index, TfLiteTensor* activation_state,
|
||||
TfLiteTensor* output) {
|
||||
TfLiteContext* context, TfLiteNode* node, const TfLiteEvalTensor* input,
|
||||
const TfLiteEvalTensor* weights_feature,
|
||||
const TfLiteEvalTensor* weights_time, const TfLiteEvalTensor* bias,
|
||||
const TfLiteSVDFParams* params, int scratch_tensor_index,
|
||||
TfLiteEvalTensor* activation_state, TfLiteEvalTensor* output) {
|
||||
const int rank = params->rank;
|
||||
const int batch_size = input->dims->data[0];
|
||||
const int input_size = input->dims->data[1];
|
||||
@ -126,12 +131,14 @@ inline void EvalFloatSVDF(
|
||||
const int num_units = num_filters / rank;
|
||||
const int memory_size = weights_time->dims->data[1];
|
||||
|
||||
const float* weights_feature_ptr = GetTensorData<float>(weights_feature);
|
||||
const float* weights_time_ptr = GetTensorData<float>(weights_time);
|
||||
const float* bias_ptr = GetTensorData<float>(bias);
|
||||
const float* input_ptr = GetTensorData<float>(input);
|
||||
const float* weights_feature_ptr =
|
||||
tflite::micro::GetTensorData<float>(weights_feature);
|
||||
const float* weights_time_ptr =
|
||||
tflite::micro::GetTensorData<float>(weights_time);
|
||||
const float* bias_ptr = tflite::micro::GetTensorData<float>(bias);
|
||||
const float* input_ptr = tflite::micro::GetTensorData<float>(input);
|
||||
|
||||
float* state_ptr = GetTensorData<float>(activation_state);
|
||||
float* state_ptr = tflite::micro::GetTensorData<float>(activation_state);
|
||||
|
||||
TFLITE_DCHECK(context != nullptr);
|
||||
TFLITE_DCHECK(context->GetScratchBuffer != nullptr);
|
||||
@ -139,7 +146,7 @@ inline void EvalFloatSVDF(
|
||||
float* scratch_ptr = static_cast<float*>(
|
||||
context->GetScratchBuffer(context, scratch_tensor_index));
|
||||
|
||||
float* output_ptr = GetTensorData<float>(output);
|
||||
float* output_ptr = tflite::micro::GetTensorData<float>(output);
|
||||
|
||||
// Left shift the activation_state.
|
||||
{
|
||||
@ -185,14 +192,13 @@ inline void EvalFloatSVDF(
|
||||
}
|
||||
|
||||
void EvalIntegerSVDF(TfLiteContext* context, TfLiteNode* node,
|
||||
const TfLiteTensor* input_tensor,
|
||||
const TfLiteTensor* weights_feature_tensor,
|
||||
const TfLiteTensor* weights_time_tensor,
|
||||
const TfLiteTensor* bias_tensor,
|
||||
const TfLiteEvalTensor* input_tensor,
|
||||
const TfLiteEvalTensor* weights_feature_tensor,
|
||||
const TfLiteEvalTensor* weights_time_tensor,
|
||||
const TfLiteEvalTensor* bias_tensor,
|
||||
const TfLiteSVDFParams* params,
|
||||
TfLiteTensor* activation_state_tensor,
|
||||
TfLiteTensor* output_tensor, const OpData& data,
|
||||
int32_t input_zp, int32_t output_zp) {
|
||||
TfLiteEvalTensor* activation_state_tensor,
|
||||
TfLiteEvalTensor* output_tensor, const OpData& data) {
|
||||
const int n_rank = params->rank;
|
||||
const int n_batch = input_tensor->dims->data[0];
|
||||
const int n_input = input_tensor->dims->data[1];
|
||||
@ -209,7 +215,8 @@ void EvalIntegerSVDF(TfLiteContext* context, TfLiteNode* node,
|
||||
context->GetScratchBuffer(context, data.scratch_output_tensor_index));
|
||||
|
||||
// Shift states.
|
||||
int16_t* const state_ptr = GetTensorData<int16_t>(activation_state_tensor);
|
||||
int16_t* const state_ptr =
|
||||
tflite::micro::GetTensorData<int16_t>(activation_state_tensor);
|
||||
|
||||
// Left shift the activation_state.
|
||||
{
|
||||
@ -225,10 +232,11 @@ void EvalIntegerSVDF(TfLiteContext* context, TfLiteNode* node,
|
||||
|
||||
// Feature matmul.
|
||||
{
|
||||
int16_t* state = GetTensorData<int16_t>(activation_state_tensor);
|
||||
const int8_t* input = GetTensorData<int8_t>(input_tensor);
|
||||
int16_t* state =
|
||||
tflite::micro::GetTensorData<int16_t>(activation_state_tensor);
|
||||
const int8_t* input = tflite::micro::GetTensorData<int8_t>(input_tensor);
|
||||
const int8_t* weight_feature =
|
||||
GetTensorData<int8_t>(weights_feature_tensor);
|
||||
tflite::micro::GetTensorData<int8_t>(weights_feature_tensor);
|
||||
const int32_t output_max = std::numeric_limits<int16_t>::max();
|
||||
const int32_t output_min = std::numeric_limits<int16_t>::min();
|
||||
int16_t* result_in_batch = state + (n_memory - 1);
|
||||
@ -238,7 +246,8 @@ void EvalIntegerSVDF(TfLiteContext* context, TfLiteNode* node,
|
||||
int32_t dot_prod = 0;
|
||||
const int8_t* vector_in_batch = input + b * n_input;
|
||||
for (int c = 0; c < n_input; c++) {
|
||||
dot_prod += *matrix_ptr++ * (*vector_in_batch++ - input_zp);
|
||||
dot_prod +=
|
||||
*matrix_ptr++ * (*vector_in_batch++ - data.input_zero_point);
|
||||
}
|
||||
dot_prod = MultiplyByQuantizedMultiplier(
|
||||
dot_prod, data.effective_scale_1_a, data.effective_scale_1_b);
|
||||
@ -261,9 +270,10 @@ void EvalIntegerSVDF(TfLiteContext* context, TfLiteNode* node,
|
||||
int32_t* scratch_ptr_batch = scratch_tensor + b * n_filter;
|
||||
|
||||
// Perform batched vector dot product:
|
||||
const int16_t* vector1_ptr = GetTensorData<int16_t>(weights_time_tensor);
|
||||
const int16_t* vector1_ptr =
|
||||
tflite::micro::GetTensorData<int16_t>(weights_time_tensor);
|
||||
const int16_t* vector2_ptr =
|
||||
GetTensorData<int16_t>(activation_state_tensor) +
|
||||
tflite::micro::GetTensorData<int16_t>(activation_state_tensor) +
|
||||
b * n_memory * n_filter;
|
||||
|
||||
for (int i = 0; i < n_filter; i++) {
|
||||
@ -281,7 +291,8 @@ void EvalIntegerSVDF(TfLiteContext* context, TfLiteNode* node,
|
||||
// Add bias.
|
||||
if (bias_tensor) {
|
||||
// Vector batch assign:
|
||||
const int32_t* bias_data = GetTensorData<int32_t>(bias_tensor);
|
||||
const int32_t* bias_data =
|
||||
tflite::micro::GetTensorData<int32_t>(bias_tensor);
|
||||
for (int i = 0; i < n_batch; ++i) {
|
||||
int32_t* output_ptr = scratch_output_tensor + i * n_unit;
|
||||
const int32_t* bias_ptr = bias_data;
|
||||
@ -316,9 +327,10 @@ void EvalIntegerSVDF(TfLiteContext* context, TfLiteNode* node,
|
||||
int32_t x1 = scratch_output_tensor[i];
|
||||
int32_t x2 = MultiplyByQuantizedMultiplier(x1, data.effective_scale_2_a,
|
||||
data.effective_scale_2_b);
|
||||
int32_t x3 = x2 + output_zp;
|
||||
int32_t x3 = x2 + data.output_zero_point;
|
||||
int32_t x4 = std::min(std::max(output_min, x3), output_max);
|
||||
GetTensorData<int8_t>(output_tensor)[i] = static_cast<int8_t>(x4);
|
||||
tflite::micro::GetTensorData<int8_t>(output_tensor)[i] =
|
||||
static_cast<int8_t>(x4);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -338,12 +350,7 @@ constexpr int kOutputTensor = 0;
|
||||
|
||||
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
|
||||
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
|
||||
void* data = nullptr;
|
||||
if (context->AllocatePersistentBuffer(context, sizeof(OpData), &data) ==
|
||||
kTfLiteError) {
|
||||
return nullptr;
|
||||
}
|
||||
return data;
|
||||
return context->AllocatePersistentBuffer(context, sizeof(OpData));
|
||||
}
|
||||
|
||||
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
|
||||
@ -382,7 +389,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
|
||||
TF_LITE_ENSURE_EQ(context, NumDimensions(input), 2);
|
||||
|
||||
// Validate Tensor Output:
|
||||
// [0] = float/int8, {2, batch_size, num_units}
|
||||
// [0] = float/int8_t, {2, batch_size, num_units}
|
||||
TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
|
||||
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
|
||||
TF_LITE_ENSURE_EQ(context, NumDimensions(output), 2);
|
||||
@ -408,9 +415,14 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
|
||||
TF_LITE_ENSURE_EQ(context, activation_state->dims->data[0], batch_size);
|
||||
TF_LITE_ENSURE_EQ(context, activation_state->dims->data[1],
|
||||
memory_size * num_filters);
|
||||
// Since is_variable is not part of TFLiteEvalTensor, check is_variable here.
|
||||
TF_LITE_ENSURE_EQ(context, activation_state->is_variable, true);
|
||||
|
||||
TF_LITE_ENSURE_EQ(context, node->inputs->size, 5);
|
||||
|
||||
TFLITE_DCHECK(node->user_data != nullptr);
|
||||
OpData* data = static_cast<OpData*>(node->user_data);
|
||||
|
||||
if (input->type == kTfLiteInt8) {
|
||||
TF_LITE_ENSURE_EQ(context, weights_feature->type, kTfLiteInt8);
|
||||
TF_LITE_ENSURE_EQ(context, weights_time->type, kTfLiteInt16);
|
||||
@ -419,35 +431,30 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
|
||||
TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32);
|
||||
}
|
||||
|
||||
TF_LITE_ENSURE_EQ(context, output->type, kTfLiteInt8);
|
||||
TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteInt8);
|
||||
|
||||
const auto* input_params =
|
||||
reinterpret_cast<TfLiteAffineQuantization*>(input->quantization.params);
|
||||
const auto* weights_feature_params =
|
||||
static_cast<const TfLiteAffineQuantization*>(
|
||||
weights_feature->quantization.params);
|
||||
const auto* state_params = static_cast<const TfLiteAffineQuantization*>(
|
||||
activation_state->quantization.params);
|
||||
const auto* weight_time_params =
|
||||
static_cast<const TfLiteAffineQuantization*>(
|
||||
weights_time->quantization.params);
|
||||
const auto* output_params = static_cast<const TfLiteAffineQuantization*>(
|
||||
output->quantization.params);
|
||||
const double effective_scale_1 = static_cast<double>(
|
||||
input_params->scale->data[0] * weights_feature_params->scale->data[0] /
|
||||
state_params->scale->data[0]);
|
||||
const double effective_scale_2 = static_cast<double>(
|
||||
state_params->scale->data[0] * weight_time_params->scale->data[0] /
|
||||
output_params->scale->data[0]);
|
||||
input->params.scale * weights_feature->params.scale /
|
||||
activation_state->params.scale);
|
||||
const double effective_scale_2 =
|
||||
static_cast<double>(activation_state->params.scale *
|
||||
weights_time->params.scale / output->params.scale);
|
||||
|
||||
TFLITE_DCHECK(node->user_data != nullptr);
|
||||
OpData* data = static_cast<OpData*>(node->user_data);
|
||||
// TODO(b/162018098): Use TF_LITE_ENSURE_NEAR when it is ready.
|
||||
TF_LITE_ENSURE(
|
||||
context,
|
||||
std::abs(static_cast<double>(bias->params.scale) -
|
||||
static_cast<double>(activation_state->params.scale *
|
||||
weights_time->params.scale)) < 1e-5);
|
||||
|
||||
QuantizeMultiplier(effective_scale_1, &(data->effective_scale_1_a),
|
||||
&(data->effective_scale_1_b));
|
||||
QuantizeMultiplier(effective_scale_2, &(data->effective_scale_2_a),
|
||||
&(data->effective_scale_2_b));
|
||||
|
||||
data->input_zero_point = input->params.zero_point;
|
||||
data->output_zero_point = output->params.zero_point;
|
||||
|
||||
TFLITE_DCHECK(context->RequestScratchBufferInArena != nullptr);
|
||||
|
||||
const TfLiteStatus scratch_status = context->RequestScratchBufferInArena(
|
||||
@ -467,10 +474,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
|
||||
if (bias != nullptr) {
|
||||
TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteFloat32);
|
||||
}
|
||||
TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32);
|
||||
|
||||
TFLITE_DCHECK(node->user_data != nullptr);
|
||||
OpData* data = static_cast<OpData*>(node->user_data);
|
||||
TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteFloat32);
|
||||
|
||||
TFLITE_DCHECK(context->RequestScratchBufferInArena != nullptr);
|
||||
const TfLiteStatus scratch_status = context->RequestScratchBufferInArena(
|
||||
@ -484,20 +488,24 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
|
||||
|
||||
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
|
||||
auto* params = reinterpret_cast<TfLiteSVDFParams*>(node->builtin_data);
|
||||
|
||||
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
|
||||
const TfLiteTensor* weights_feature =
|
||||
GetInput(context, node, kWeightsFeatureTensor);
|
||||
const TfLiteTensor* weights_time =
|
||||
GetInput(context, node, kWeightsTimeTensor);
|
||||
const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
|
||||
TfLiteTensor* activation_state =
|
||||
GetVariableInput(context, node, kInputActivationStateTensor);
|
||||
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
|
||||
|
||||
TFLITE_DCHECK(node->user_data != nullptr);
|
||||
const OpData& data = *(static_cast<const OpData*>(node->user_data));
|
||||
|
||||
const TfLiteEvalTensor* input =
|
||||
tflite::micro::GetEvalInput(context, node, kInputTensor);
|
||||
const TfLiteEvalTensor* weights_feature =
|
||||
tflite::micro::GetEvalInput(context, node, kWeightsFeatureTensor);
|
||||
const TfLiteEvalTensor* weights_time =
|
||||
tflite::micro::GetEvalInput(context, node, kWeightsTimeTensor);
|
||||
const TfLiteEvalTensor* bias =
|
||||
(NumInputs(node) == 5)
|
||||
? tflite::micro::GetEvalInput(context, node, kBiasTensor)
|
||||
: nullptr;
|
||||
TfLiteEvalTensor* activation_state = tflite::micro::GetMutableEvalInput(
|
||||
context, node, kInputActivationStateTensor);
|
||||
TfLiteEvalTensor* output =
|
||||
tflite::micro::GetEvalOutput(context, node, kOutputTensor);
|
||||
|
||||
switch (weights_feature->type) {
|
||||
case kTfLiteFloat32: {
|
||||
EvalFloatSVDF(context, node, input, weights_feature, weights_time, bias,
|
||||
@ -508,11 +516,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
|
||||
}
|
||||
|
||||
case kTfLiteInt8: {
|
||||
TF_LITE_ENSURE_EQ(context, params->activation, kTfLiteActRelu);
|
||||
|
||||
EvalIntegerSVDF(context, node, input, weights_feature, weights_time, bias,
|
||||
params, activation_state, output, data,
|
||||
input->params.zero_point, output->params.zero_point);
|
||||
params, activation_state, output, data);
|
||||
return kTfLiteOk;
|
||||
break;
|
||||
}
|
||||
@ -527,16 +532,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
|
||||
|
||||
} // namespace svdf
|
||||
|
||||
TfLiteRegistration* Register_SVDF() {
|
||||
static TfLiteRegistration r = {/*init=*/svdf::Init,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/svdf::Prepare,
|
||||
/*invoke=*/svdf::Eval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
TfLiteRegistration Register_SVDF() {
|
||||
return {/*init=*/svdf::Init,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/svdf::Prepare,
|
||||
/*invoke=*/svdf::Eval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
} // namespace micro
|
||||
|
||||
@ -17,6 +17,7 @@ limitations under the License.
|
||||
#include "tensorflow/lite/c/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
|
||||
#include "tensorflow/lite/kernels/kernel_util.h"
|
||||
#include "tensorflow/lite/micro/kernels/kernel_util.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace ops {
|
||||
@ -28,14 +29,16 @@ constexpr int kInputTensor = 0;
|
||||
|
||||
template <typename T>
|
||||
TfLiteStatus UnpackImpl(TfLiteContext* context, TfLiteNode* node,
|
||||
const TfLiteTensor* input, int output_count, int axis) {
|
||||
const TfLiteTensor* output0 = GetOutput(context, node, 0);
|
||||
const TfLiteEvalTensor* input, int output_count,
|
||||
int axis) {
|
||||
const TfLiteEvalTensor* output0 =
|
||||
tflite::micro::GetEvalOutput(context, node, 0);
|
||||
const TfLiteIntArray* input_dims = input->dims;
|
||||
const TfLiteIntArray* output_dims = output0->dims;
|
||||
const int dimensions = input_dims->size;
|
||||
|
||||
if (axis < 0) {
|
||||
axis += NumDimensions(input);
|
||||
axis += input->dims->size;
|
||||
}
|
||||
|
||||
TFLITE_DCHECK_LT(axis, dimensions);
|
||||
@ -54,11 +57,11 @@ TfLiteStatus UnpackImpl(TfLiteContext* context, TfLiteNode* node,
|
||||
}
|
||||
TFLITE_DCHECK_EQ(output_size, copy_size * outer_size);
|
||||
|
||||
const T* input_data = GetTensorData<T>(input);
|
||||
const T* input_data = tflite::micro::GetTensorData<T>(input);
|
||||
|
||||
for (int i = 0; i < output_count; ++i) {
|
||||
TfLiteTensor* t = GetOutput(context, node, i);
|
||||
T* output_data = GetTensorData<T>(t);
|
||||
TfLiteEvalTensor* t = tflite::micro::GetEvalOutput(context, node, i);
|
||||
T* output_data = tflite::micro::GetTensorData<T>(t);
|
||||
for (int k = 0; k < outer_size; ++k) {
|
||||
T* output_ptr = output_data + copy_size * k;
|
||||
int loc = k * output_count * copy_size + i * copy_size;
|
||||
@ -74,7 +77,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
|
||||
TfLiteUnpackParams* data =
|
||||
reinterpret_cast<TfLiteUnpackParams*>(node->builtin_data);
|
||||
|
||||
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
|
||||
const TfLiteEvalTensor* input =
|
||||
tflite::micro::GetEvalInput(context, node, kInputTensor);
|
||||
|
||||
switch (input->type) {
|
||||
case kTfLiteFloat32: {
|
||||
@ -101,16 +105,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
|
||||
} // namespace
|
||||
} // namespace unpack
|
||||
|
||||
TfLiteRegistration* Register_UNPACK() {
|
||||
static TfLiteRegistration r = {/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/nullptr,
|
||||
/*invoke=*/unpack::Eval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
return &r;
|
||||
TfLiteRegistration Register_UNPACK() {
|
||||
return {/*init=*/nullptr,
|
||||
/*free=*/nullptr,
|
||||
/*prepare=*/nullptr,
|
||||
/*invoke=*/unpack::Eval,
|
||||
/*profiling_string=*/nullptr,
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
}
|
||||
|
||||
} // namespace micro
|
||||
|
||||
@ -15,9 +15,15 @@ limitations under the License.
|
||||
|
||||
#include "tensorflow/lite/micro/memory_helpers.h"
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
|
||||
#include "flatbuffers/flatbuffers.h" // from @flatbuffers
|
||||
#include "tensorflow/lite/c/common.h"
|
||||
#include "tensorflow/lite/core/api/error_reporter.h"
|
||||
#include "tensorflow/lite/core/api/flatbuffer_conversions.h"
|
||||
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
|
||||
#include "tensorflow/lite/schema/schema_generated.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
@ -40,8 +46,7 @@ size_t AlignSizeUp(size_t size, size_t alignment) {
|
||||
return aligned_size;
|
||||
}
|
||||
|
||||
TfLiteStatus TfLiteTypeSizeOf(TfLiteType type, size_t* size,
|
||||
ErrorReporter* reporter) {
|
||||
TfLiteStatus TfLiteTypeSizeOf(TfLiteType type, size_t* size) {
|
||||
switch (type) {
|
||||
case kTfLiteFloat32:
|
||||
*size = sizeof(float);
|
||||
@ -67,9 +72,10 @@ TfLiteStatus TfLiteTypeSizeOf(TfLiteType type, size_t* size,
|
||||
case kTfLiteComplex64:
|
||||
*size = sizeof(float) * 2;
|
||||
break;
|
||||
case kTfLiteComplex128:
|
||||
*size = sizeof(double) * 2;
|
||||
break;
|
||||
default:
|
||||
reporter->Report("Type %s (%d) not is not supported",
|
||||
TfLiteTypeGetName(type), type);
|
||||
return kTfLiteError;
|
||||
}
|
||||
return kTfLiteOk;
|
||||
@ -79,17 +85,71 @@ TfLiteStatus BytesRequiredForTensor(const tflite::Tensor& flatbuffer_tensor,
                                    size_t* bytes, size_t* type_size,
                                    ErrorReporter* error_reporter) {
  int element_count = 1;
  for (size_t n = 0; n < flatbuffer_tensor.shape()->Length(); ++n) {
    element_count *= flatbuffer_tensor.shape()->Get(n);
  // If flatbuffer_tensor.shape == nullptr, then flatbuffer_tensor is a scalar
  // so has 1 element.
  if (flatbuffer_tensor.shape() != nullptr) {
    for (size_t n = 0; n < flatbuffer_tensor.shape()->Length(); ++n) {
      element_count *= flatbuffer_tensor.shape()->Get(n);
    }
  }

  TfLiteType tf_lite_type;
  TF_LITE_ENSURE_STATUS(ConvertTensorType(flatbuffer_tensor.type(),
                                          &tf_lite_type, error_reporter));
  TF_LITE_ENSURE_STATUS(
      TfLiteTypeSizeOf(tf_lite_type, type_size, error_reporter));
  TF_LITE_ENSURE_STATUS(TfLiteTypeSizeOf(tf_lite_type, type_size));
  *bytes = element_count * (*type_size);
  return kTfLiteOk;
}

TfLiteStatus TfLiteEvalTensorByteLength(const TfLiteEvalTensor* eval_tensor,
                                        size_t* out_bytes) {
  TFLITE_DCHECK(out_bytes != nullptr);

  int element_count = 1;
  // If eval_tensor->dims == nullptr, then tensor is a scalar so has 1 element.
  if (eval_tensor->dims != nullptr) {
    for (int n = 0; n < eval_tensor->dims->size; ++n) {
      element_count *= eval_tensor->dims->data[n];
    }
  }
  size_t type_size;
  TF_LITE_ENSURE_STATUS(TfLiteTypeSizeOf(eval_tensor->type, &type_size));
  *out_bytes = element_count * type_size;
  return kTfLiteOk;
}

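To make the byte accounting concrete: a 2x3 kTfLiteFloat32 tensor has element_count = 2 * 3 = 6 and a 4-byte element size, so the helper reports 6 * 4 = 24 bytes. A hedged usage sketch, where eval_tensors and i are placeholders:

// Sketch only: querying the storage needed by one TfLiteEvalTensor.
size_t tensor_bytes = 0;
if (TfLiteEvalTensorByteLength(&eval_tensors[i], &tensor_bytes) == kTfLiteOk) {
  // tensor_bytes == element_count * TfLiteTypeSizeOf(type),
  // e.g. 6 * 4 = 24 for a 2x3 float32 tensor.
}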
TfLiteStatus AllocateOutputDimensionsFromInput(TfLiteContext* context,
                                               const TfLiteTensor* input1,
                                               const TfLiteTensor* input2,
                                               TfLiteTensor* output) {
  const TfLiteTensor* input = nullptr;

  TF_LITE_ENSURE(context, input1->dims != nullptr);
  TF_LITE_ENSURE(context, input2->dims != nullptr);
  TF_LITE_ENSURE(context, output->dims->size == 0);

  input = input1->dims->size > input2->dims->size ? input1 : input2;
  TF_LITE_ENSURE(context, output->type == input->type);

  size_t size = 0;
  TfLiteTypeSizeOf(input->type, &size);
  const int dimensions_count = tflite::GetTensorShape(input).DimensionsCount();
  for (int i = 0; i < dimensions_count; i++) {
    size *= input->dims->data[i];
  }

  output->bytes = size;

  output->dims =
      reinterpret_cast<TfLiteIntArray*>(context->AllocatePersistentBuffer(
          context, TfLiteIntArrayGetSizeInBytes(size)));

  output->dims->size = input->dims->size;
  for (int i = 0; i < dimensions_count; i++) {
    output->dims->data[i] = input->dims->data[i];
  }

  return kTfLiteOk;
}

}  // namespace tflite

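A sketch of how a two-input kernel's Prepare step might call the helper defined above; this is not part of the diff, and the tensor indices and the GetOutput helper are assumptions:

// Sketch only: output takes the larger input's shape; the helper expects
// output->dims->size == 0 on entry and allocates the new dims from the
// persistent arena via context->AllocatePersistentBuffer.
TfLiteStatus PrepareOutputShapeSketch(TfLiteContext* context,
                                      TfLiteNode* node) {
  const TfLiteTensor* input1 = GetInput(context, node, 0);
  const TfLiteTensor* input2 = GetInput(context, node, 1);
  TfLiteTensor* output = GetOutput(context, node, 0);
  return AllocateOutputDimensionsFromInput(context, input1, input2, output);
}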
@ -15,6 +15,9 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_MICRO_MEMORY_HELPERS_H_
#define TENSORFLOW_LITE_MICRO_MEMORY_HELPERS_H_

#include <cstddef>
#include <cstdint>

#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/core/api/error_reporter.h"
#include "tensorflow/lite/schema/schema_generated.h"
@ -31,14 +34,26 @@ uint8_t* AlignPointerDown(uint8_t* data, size_t alignment);
size_t AlignSizeUp(size_t size, size_t alignment);

// Returns size in bytes for a given TfLiteType.
TfLiteStatus TfLiteTypeSizeOf(TfLiteType type, size_t* size,
                              ErrorReporter* reporter);
TfLiteStatus TfLiteTypeSizeOf(TfLiteType type, size_t* size);

// How many bytes are needed to hold a tensor's contents.
TfLiteStatus BytesRequiredForTensor(const tflite::Tensor& flatbuffer_tensor,
                                    size_t* bytes, size_t* type_size,
                                    ErrorReporter* error_reporter);

// How many bytes are used in a TfLiteEvalTensor instance. The byte length is
// returned in out_bytes.
TfLiteStatus TfLiteEvalTensorByteLength(const TfLiteEvalTensor* eval_tensor,
                                        size_t* out_bytes);

// Deduce output dimensions from input and allocate given size.
// Useful for operators with two inputs where the largest input should equal the
// output dimension.
TfLiteStatus AllocateOutputDimensionsFromInput(TfLiteContext* context,
                                               const TfLiteTensor* input1,
                                               const TfLiteTensor* input2,
                                               TfLiteTensor* output);

}  // namespace tflite

#endif  // TENSORFLOW_LITE_MICRO_MEMORY_HELPERS_H_

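The alignment helpers declared above are what the arena allocation paths lean on; a small sketch, assuming a 16-byte alignment and a hypothetical arena_end pointer:

// Sketch only: carve an aligned slot of padded size from the arena tail.
uint8_t* tail = AlignPointerDown(arena_end, 16);        // arena_end is assumed
size_t padded = AlignSizeUp(sizeof(TfLiteTensor), 16);  // round size up to 16
tail -= padded;  // the slot [tail, tail + padded) stays 16-byte aligned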
@ -48,10 +48,10 @@ GreedyMemoryPlanner::GreedyMemoryPlanner(unsigned char* scratch_buffer,
|
||||
requirements_ = reinterpret_cast<BufferRequirements*>(next_free);
|
||||
next_free += sizeof(BufferRequirements) * max_buffer_count_;
|
||||
|
||||
buffer_sizes_sorted_by_size_ = reinterpret_cast<int*>(next_free);
|
||||
buffer_sizes_sorted_ = reinterpret_cast<int*>(next_free);
|
||||
next_free += sizeof(int) * max_buffer_count_;
|
||||
|
||||
buffer_ids_sorted_by_size_ = reinterpret_cast<int*>(next_free);
|
||||
buffer_ids_sorted_ = reinterpret_cast<int*>(next_free);
|
||||
next_free += sizeof(int) * max_buffer_count_;
|
||||
|
||||
buffers_sorted_by_offset_ = reinterpret_cast<ListEntry*>(next_free);
|
||||
@ -76,11 +76,24 @@ TfLiteStatus GreedyMemoryPlanner::AddBuffer(
|
||||
current->size = size;
|
||||
current->first_time_used = first_time_used;
|
||||
current->last_time_used = last_time_used;
|
||||
current->offline_offset = kOnlinePlannedBuffer;
|
||||
++buffer_count_;
|
||||
need_to_calculate_offsets_ = true;
|
||||
return kTfLiteOk;
|
||||
}
|
||||
|
||||
TfLiteStatus GreedyMemoryPlanner::AddBuffer(
|
||||
tflite::ErrorReporter* error_reporter, int size, int first_time_used,
|
||||
int last_time_used, int offline_offset) {
|
||||
BufferRequirements* current = &requirements_[buffer_count_];
|
||||
if (AddBuffer(error_reporter, size, first_time_used, last_time_used) !=
|
||||
kTfLiteOk) {
|
||||
return kTfLiteError;
|
||||
}
|
||||
current->offline_offset = offline_offset;
|
||||
return kTfLiteOk;
|
||||
}
|
||||
|
||||
bool GreedyMemoryPlanner::DoesEntryOverlapInTime(
|
||||
const GreedyMemoryPlanner::ListEntry* entry, const int first_time_used,
|
||||
const int last_time_used) const {
|
||||
@ -102,7 +115,7 @@ GreedyMemoryPlanner::NextSimultaneouslyActiveBuffer(
|
||||
ListEntry* result = nullptr;
|
||||
ListEntry* candidate_next_entry;
|
||||
if (start == nullptr) {
|
||||
candidate_next_entry = &buffers_sorted_by_offset_[0];
|
||||
candidate_next_entry = &buffers_sorted_by_offset_[first_entry_index_];
|
||||
} else {
|
||||
if (start->next_entry_index == -1) {
|
||||
return nullptr;
|
||||
@ -134,29 +147,51 @@ void GreedyMemoryPlanner::CalculateOffsetsIfNeeded() {
|
||||
// This helps find a more compact layout. Intuitively, you can think
|
||||
// about putting the large buffers in place first, and then the
|
||||
// smaller buffers can fit in the gaps, rather than fragmenting the
|
||||
// gaps with small buffers at the beginning.
|
||||
// gaps with small buffers at the beginning. Add offline planned offsets
|
||||
// first in the list, since they have a predetermined offset.
|
||||
int idx_from_tail = buffer_count_;
|
||||
int idx_from_head = 0;
|
||||
for (int i = 0; i < buffer_count_; ++i) {
|
||||
buffer_sizes_sorted_by_size_[i] = requirements_[i].size;
|
||||
buffer_ids_sorted_by_size_[i] = i;
|
||||
buffer_offsets_[i] = -1;
|
||||
if (requirements_[i].offline_offset == kOnlinePlannedBuffer) {
|
||||
idx_from_tail--;
|
||||
buffer_sizes_sorted_[idx_from_tail] = requirements_[i].size;
|
||||
buffer_ids_sorted_[idx_from_tail] = i;
|
||||
buffer_offsets_[i] = -1;
|
||||
} else {
|
||||
buffer_sizes_sorted_[idx_from_head] = requirements_[i].size;
|
||||
buffer_ids_sorted_[idx_from_head] = i;
|
||||
buffer_offsets_[i] = requirements_[i].offline_offset;
|
||||
idx_from_head++;
|
||||
}
|
||||
}
|
||||
// This sorting algorithm is naive, and may end up taking a very long time
|
||||
// with hundreds of buffers.
|
||||
ReverseSortInPlace(buffer_sizes_sorted_by_size_, buffer_ids_sorted_by_size_,
|
||||
buffer_count_);
|
||||
|
||||
// Put the largest buffer at offset zero to start the process.
|
||||
ListEntry* first_entry = &buffers_sorted_by_offset_[0];
|
||||
first_entry->offset = 0;
|
||||
first_entry->requirements_index = buffer_ids_sorted_by_size_[0];
|
||||
first_entry->next_entry_index = -1;
|
||||
// This sorting algorithm is naive, and may end up taking a very long time
|
||||
// with hundreds of buffers. Do not sort the offline planned offsets.
|
||||
ReverseSortInPlace(&buffer_sizes_sorted_[idx_from_head],
|
||||
&buffer_ids_sorted_[idx_from_head],
|
||||
buffer_count_ - idx_from_head);
|
||||
|
||||
// Initialize the first entry to the first buffer in
|
||||
// buffer_ids_sorted_.
|
||||
// - If there are no offline planned offsets, the largest buffer will be
|
||||
// first, and the buffers will be handled in size order.
|
||||
// - If offline offsets are present, these will be handled first in order
|
||||
// for the greedy algorithm to utilized gaps in the offline plan.
|
||||
first_entry_index_ = 0;
|
||||
next_free_entry_ = 1;
|
||||
buffer_offsets_[buffer_ids_sorted_by_size_[0]] = 0;
|
||||
ListEntry* first_entry = &buffers_sorted_by_offset_[first_entry_index_];
|
||||
first_entry->next_entry_index = -1; // to mark the entry as end of list
|
||||
int buffer_id = buffer_ids_sorted_[0];
|
||||
first_entry->requirements_index = buffer_id;
|
||||
if (requirements_[buffer_id].offline_offset == kOnlinePlannedBuffer) {
|
||||
buffer_offsets_[buffer_id] = 0;
|
||||
}
|
||||
first_entry->offset = buffer_offsets_[buffer_id];
|
||||
|
||||
// Work through the rest of the buffers to find a good gap to place each one.
|
||||
for (int i = 1; i < buffer_count_; ++i) {
|
||||
// The id is the order the buffer was originally added by the client.
|
||||
const int buffer_id = buffer_ids_sorted_by_size_[i];
|
||||
buffer_id = buffer_ids_sorted_[i];
|
||||
// Look at what size and time range the buffer needs to be active.
|
||||
BufferRequirements* wanted_requirements = &requirements_[buffer_id];
|
||||
const int wanted_size = wanted_requirements->size;
|
||||
@ -168,37 +203,43 @@ void GreedyMemoryPlanner::CalculateOffsetsIfNeeded() {
|
||||
// so that it's easy to find the next buffer in memory, and so the gap.
|
||||
// The candidate_entry variable holds the buffer that we're considering
|
||||
// placing the current buffer after.
|
||||
ListEntry* prior_entry = nullptr;
|
||||
|
||||
int candidate_offset = 0;
|
||||
// Loop through the offset-ordered list of buffers, looking for gaps.
|
||||
while (true) {
|
||||
// Find out what the next active buffer is.
|
||||
ListEntry* next_entry = NextSimultaneouslyActiveBuffer(
|
||||
prior_entry, wanted_first_time_used, wanted_last_time_used);
|
||||
if (wanted_requirements->offline_offset == kOnlinePlannedBuffer) {
|
||||
ListEntry* prior_entry = nullptr;
|
||||
while (true) {
|
||||
// Find out what the next active buffer is.
|
||||
ListEntry* next_entry = NextSimultaneouslyActiveBuffer(
|
||||
prior_entry, wanted_first_time_used, wanted_last_time_used);
|
||||
|
||||
if (prior_entry) {
|
||||
BufferRequirements* candidate_requirements =
|
||||
&requirements_[prior_entry->requirements_index];
|
||||
const int prior_entry_offset =
|
||||
prior_entry->offset + candidate_requirements->size;
|
||||
if (prior_entry_offset > candidate_offset) {
|
||||
candidate_offset = prior_entry_offset;
|
||||
if (prior_entry) {
|
||||
BufferRequirements* candidate_requirements =
|
||||
&requirements_[prior_entry->requirements_index];
|
||||
const int prior_entry_offset =
|
||||
prior_entry->offset + candidate_requirements->size;
|
||||
if (prior_entry_offset > candidate_offset) {
|
||||
candidate_offset = prior_entry_offset;
|
||||
}
|
||||
}
|
||||
if (next_entry == nullptr) {
|
||||
// We're at the end of the list, so we can always append the buffer
|
||||
// here.
|
||||
break;
|
||||
}
|
||||
// Find out how much space there is between us and the next buffer.
|
||||
const int gap = next_entry->offset - candidate_offset;
|
||||
if (gap >= wanted_size) {
|
||||
// This entry has a big enough gap between it and the next, so
|
||||
// use it!
|
||||
break;
|
||||
}
|
||||
// The gap wasn't big enough, so move on to another candidate.
|
||||
prior_entry = next_entry;
|
||||
}
|
||||
if (next_entry == nullptr) {
|
||||
// We're at the end of the list, so we can always append the buffer
|
||||
// here.
|
||||
break;
|
||||
}
|
||||
// Find out how much space there is between us and the next buffer.
|
||||
const int gap = next_entry->offset - candidate_offset;
|
||||
if (gap >= wanted_size) {
|
||||
// This entry has a big enough gap between it and the next, so
|
||||
// use it!
|
||||
break;
|
||||
}
|
||||
// The gap wasn't big enough, so move on to another candidate.
|
||||
prior_entry = next_entry;
|
||||
} else {
|
||||
// Offline planned offset are to be considered constant
|
||||
candidate_offset = wanted_requirements->offline_offset;
|
||||
}
|
||||
// At this point, we've either found a gap (possibly at the end of the
|
||||
// list) and want to place the buffer there, or there are no other active
|
||||
@ -212,26 +253,36 @@ void GreedyMemoryPlanner::CalculateOffsetsIfNeeded() {
|
||||
new_entry->requirements_index = buffer_id;
|
||||
const int new_entry_index = next_free_entry_;
|
||||
++next_free_entry_;
|
||||
ListEntry* current_entry = first_entry;
|
||||
// Make sure that we insert the buffer at the correct place in the ordered
|
||||
// list.
|
||||
while (true) {
|
||||
const int next_entry_index = current_entry->next_entry_index;
|
||||
if (next_entry_index == -1) {
|
||||
// We're at the end of the list, so just add the new entry here.
|
||||
current_entry->next_entry_index = new_entry_index;
|
||||
new_entry->next_entry_index = -1;
|
||||
break;
|
||||
|
||||
if (first_entry->offset > candidate_offset) {
|
||||
// The new entry offset is smaller than the first entry offset =>
|
||||
// replace the first entry
|
||||
first_entry = new_entry;
|
||||
first_entry->next_entry_index = first_entry_index_;
|
||||
first_entry_index_ = new_entry_index;
|
||||
} else {
|
||||
ListEntry* current_entry = first_entry;
|
||||
// Make sure that we insert the buffer at the correct place in the
|
||||
// buffer-offset-ordered list
|
||||
while (true) {
|
||||
const int next_entry_index = current_entry->next_entry_index;
|
||||
if (next_entry_index == -1) {
|
||||
// We're at the end of the list, so just add the new entry here.
|
||||
current_entry->next_entry_index = new_entry_index;
|
||||
new_entry->next_entry_index = -1;
|
||||
break;
|
||||
}
|
||||
// not at the end of the list -> take a look at next entry
|
||||
ListEntry* next_entry = &buffers_sorted_by_offset_[next_entry_index];
|
||||
if (next_entry->offset > candidate_offset) {
|
||||
// We're at the right spot to do an insertion and retain the sorting
|
||||
// order, so place the new entry here.
|
||||
new_entry->next_entry_index = current_entry->next_entry_index;
|
||||
current_entry->next_entry_index = new_entry_index;
|
||||
break;
|
||||
}
|
||||
current_entry = next_entry;
|
||||
}
|
||||
ListEntry* next_entry = &buffers_sorted_by_offset_[next_entry_index];
|
||||
if (next_entry->offset > candidate_offset) {
|
||||
// We're at the right spot to do an insertion and retain the sorting
|
||||
// order, so place the new entry here.
|
||||
new_entry->next_entry_index = current_entry->next_entry_index;
|
||||
current_entry->next_entry_index = new_entry_index;
|
||||
break;
|
||||
}
|
||||
current_entry = next_entry;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -241,7 +292,7 @@ size_t GreedyMemoryPlanner::GetMaximumMemorySize() {
|
||||
if (buffer_count_ == 0) {
|
||||
return 0;
|
||||
}
|
||||
ListEntry* entry = &buffers_sorted_by_offset_[0];
|
||||
ListEntry* entry = &buffers_sorted_by_offset_[first_entry_index_];
|
||||
size_t max_size = 0;
|
||||
while (entry) {
|
||||
BufferRequirements* requirements =
|
||||
|
||||
@ -21,6 +21,8 @@ limitations under the License.

namespace tflite {

constexpr int kOnlinePlannedBuffer = -1;

// A memory planner that uses a greedy algorithm to arrange buffers in memory
// to minimize the overall arena size needed.
//
@ -59,6 +61,12 @@ class GreedyMemoryPlanner : public MemoryPlanner {
  TfLiteStatus AddBuffer(ErrorReporter* error_reporter, int size,
                         int first_time_used, int last_time_used) override;

  // Record details of an offline planned buffer offset we want to place.
  // offline_offset is the buffer offset from the start of the arena.
  TfLiteStatus AddBuffer(ErrorReporter* error_reporter, int size,
                         int first_time_used, int last_time_used,
                         int offline_offset);

  // Returns the high-water mark of used memory. This is the minimum size of a
  // memory arena you'd need to allocate to hold these buffers.
  size_t GetMaximumMemorySize() override;
@ -90,8 +98,8 @@ class GreedyMemoryPlanner : public MemoryPlanner {
  static size_t per_buffer_size() {
    const int per_buffer_size =
        sizeof(BufferRequirements) +  // requirements_
        sizeof(int) +                 // buffer_sizes_sorted_by_size_
        sizeof(int) +                 // buffer_ids_sorted_by_size_
        sizeof(int) +                 // buffer_sizes_sorted_
        sizeof(int) +                 // buffer_ids_sorted_
        sizeof(ListEntry) +           // buffers_sorted_by_offset_
        sizeof(int);                  // buffer_offsets_;
    return per_buffer_size;
@ -121,16 +129,25 @@ class GreedyMemoryPlanner : public MemoryPlanner {
  // Records the client-provided information about each buffer.
  struct BufferRequirements {
    int size;
    int offline_offset;
    int first_time_used;
    int last_time_used;
  };

  // Working arrays used during the layout algorithm.
  BufferRequirements* requirements_;
  int* buffer_sizes_sorted_by_size_;
  int* buffer_ids_sorted_by_size_;
  // buffer_sizes_sorted_ and buffer_ids_sorted_ are sorted according to:
  // {
  //   offline planned buffers,
  //   online planned buffers sorted by size
  // }
  int* buffer_sizes_sorted_;
  int* buffer_ids_sorted_;
  ListEntry* buffers_sorted_by_offset_;
  int next_free_entry_;
  int next_free_entry_;    // Index of the next free entry of
                           // buffers_sorted_by_offset_
  int first_entry_index_;  // Index of the first entry (smallest offset) of
                           // buffers_sorted_by_offset_

  // Stores the outcome of the plan, the location of each buffer in the arena.
  int* buffer_offsets_;

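A sketch of driving the planner interface shown above; the constructor's scratch-size parameter, the error_reporter object, and the sizes/timesteps are all assumptions made for illustration:

// Sketch only: plan two buffers, one online and one pinned offline at offset 0.
unsigned char planner_scratch[1024];
GreedyMemoryPlanner planner(planner_scratch, sizeof(planner_scratch));
planner.AddBuffer(error_reporter, /*size=*/256,
                  /*first_time_used=*/0, /*last_time_used=*/3);
planner.AddBuffer(error_reporter, /*size=*/128,
                  /*first_time_used=*/1, /*last_time_used=*/2,
                  /*offline_offset=*/0);
size_t arena_needed = planner.GetMaximumMemorySize();  // high-water mark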
File diff suppressed because it is too large
@ -1,5 +1,5 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

b/160894903
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
@ -15,9 +15,14 @@ limitations under the License.
|
||||
#ifndef TENSORFLOW_LITE_MICRO_MICRO_ALLOCATOR_H_
|
||||
#define TENSORFLOW_LITE_MICRO_MICRO_ALLOCATOR_H_
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
|
||||
#include "flatbuffers/flatbuffers.h" // from @flatbuffers
|
||||
#include "tensorflow/lite/c/common.h"
|
||||
#include "tensorflow/lite/core/api/error_reporter.h"
|
||||
#include "tensorflow/lite/core/api/flatbuffer_conversions.h"
|
||||
#include "tensorflow/lite/micro/compatibility.h"
|
||||
#include "tensorflow/lite/micro/micro_op_resolver.h"
|
||||
#include "tensorflow/lite/micro/simple_memory_allocator.h"
|
||||
#include "tensorflow/lite/schema/schema_generated.h"
|
||||
|
||||
@ -26,16 +31,21 @@ namespace tflite {
|
||||
// Namespace used for unittests.
|
||||
namespace internal {
|
||||
|
||||
// Sets up all of the data structure members for a runtime tensor
|
||||
// based on the contents of a serialized tensor.
|
||||
TfLiteStatus InitializeRuntimeTensor(
|
||||
SimpleMemoryAllocator* allocator, const tflite::Tensor& flatbuffer_tensor,
|
||||
// Sets up all of the data structure members for a TfLiteTensor based on the
|
||||
// contents of a serialized tensor in the flatbuffer.
|
||||
// TODO(b/160894903): Once all kernels have been updated to the new
|
||||
// TfLiteEvalTensor API - drop the allocate_temp flag. This enables internal
|
||||
// flatbuffer quantization or dimension allocations to take place in either the
|
||||
// temp or tail section of the arena.
|
||||
TfLiteStatus InitializeTfLiteTensorFromFlatbuffer(
|
||||
SimpleMemoryAllocator* allocator, bool allocate_temp,
|
||||
const tflite::Tensor& flatbuffer_tensor,
|
||||
const flatbuffers::Vector<flatbuffers::Offset<Buffer>>* buffers,
|
||||
ErrorReporter* error_reporter, TfLiteTensor* result);
|
||||
|
||||
// A handle tracking scratch buffer allocation. This handle is created by
|
||||
// `RequestScratchBufferInArena`. `data` field is populated in
|
||||
// `FinishTensorAllocation` after static memory planning.
|
||||
// `FinishModelAllocation` after static memory planning.
|
||||
// TODO(b/150257460) As a future optimization, this struct could be replaced by
|
||||
// a union, since once `data` is populated, `bytes` and `node_idx` is not
|
||||
// needed.
|
||||
@ -59,7 +69,17 @@ typedef struct {
|
||||
|
||||
// Allocator responsible for allocating memory for all intermediate tensors
|
||||
// necessary to invoke a model.
|
||||
|
||||
//
|
||||
// The lifetime of the model, tensor arena and error reporter must be at
|
||||
// least as long as that of the allocator object, since the allocator needs
|
||||
// them to be accessible during its entire lifetime.
|
||||
//
|
||||
// The MicroAllocator simply plans out additional allocations that are required
|
||||
// to standup a model for inference in TF Micro. This class currently relies on
|
||||
// an additional allocator - SimpleMemoryAllocator - for all allocations from an
|
||||
// arena. These allocations are divided into head (non-persistent) and tail
|
||||
// (persistent) regions:
|
||||
//
|
||||
// Memory layout to help understand how it works
|
||||
// This information could change in the future version.
|
||||
// ************** .memory_allocator->GetBuffer()
|
||||
@ -72,42 +92,73 @@ typedef struct {
|
||||
// ************** .memory_allocator->GetBuffer() + ->GetMaxBufferSize()
|
||||
class MicroAllocator {
|
||||
public:
|
||||
// The lifetime of the model, tensor allocator and error reporter must be at
|
||||
// least as long as that of the allocator object, since the allocator needs
|
||||
// them to be accessible during its entire lifetime.
|
||||
|
||||
// Creates a MicroAllocator instance from a given tensor arena. This arena
|
||||
// will be managed by the created instance.
|
||||
// Note: Please use __declspec(align(16)) to make sure tensor_arena is 16
|
||||
// bytes aligned, otherwise some head room will be wasted.
|
||||
MicroAllocator(TfLiteContext* context, const Model* model,
|
||||
uint8_t* tensor_arena, size_t arena_size,
|
||||
ErrorReporter* error_reporter);
|
||||
// TODO(b/157615197): Cleanup constructor + factory usage.
|
||||
static MicroAllocator* Create(uint8_t* tensor_arena, size_t arena_size,
|
||||
ErrorReporter* error_reporter);
|
||||
|
||||
// Runs through the model and allocates all necessary input, output and
|
||||
// intermediate tensors.
|
||||
// WARNING: doing any allocation after calling this method has the risk of
|
||||
// corrupting tensor data so this method should be the last non-const method
|
||||
// called in this class.
|
||||
TfLiteStatus FinishTensorAllocation();
|
||||
// Creates a MicroAllocator instance using the provided SimpleMemoryAllocator
|
||||
// intance. This allocator instance will use the SimpleMemoryAllocator
|
||||
// instance to manage allocations internally.
|
||||
static MicroAllocator* Create(SimpleMemoryAllocator* memory_allocator,
|
||||
ErrorReporter* error_reporter);
|
||||
|
||||
// Returns the arena usage in bytes, only available after
|
||||
// `FinishTensorAllocation`. Otherwise, it will return 0.
|
||||
size_t used_bytes() const;
|
||||
// Begin allocating internal resources required for model inference.
|
||||
// This method will run through the flatbuffer data supplied in the model to
|
||||
// properly allocate tensor, node, and op registration data. This method is
|
||||
// expected to be followed with a call to FinishModelAllocation() before
|
||||
// resuming allocation with another model. All persistent tensor buffers are
|
||||
// stored in the out-param eval_tensors. This value is allocated from the
|
||||
// persistent memory arena and will be used to host runtime tensor buffers.
|
||||
TfLiteStatus StartModelAllocation(
|
||||
const Model* model, const MicroOpResolver& op_resolver,
|
||||
NodeAndRegistration** node_and_registrations,
|
||||
TfLiteEvalTensor** eval_tensors);
|
||||
|
||||
// Run through the model to allocate nodes and registrations. We need to keep
|
||||
// them for the entire life time of the model to allow persistent tensors.
|
||||
// This method needs to be called before FinishTensorAllocation method.
|
||||
TfLiteStatus AllocateNodeAndRegistrations(
|
||||
const OpResolver& op_resolver,
|
||||
NodeAndRegistration** node_and_registrations);
|
||||
// Finish allocating internal resources required for model inference.
|
||||
// This method will plan non-persistent buffers and commit a memory plan to
|
||||
// the 'head' section of the memory arena. All variable tensor data will also
|
||||
// be allocated. This method should be called after assigning model resources
|
||||
// in StartModelAllocation(). The eval_tensors pointer should be the value
|
||||
// passed into this class during StartModelAllocation().
|
||||
TfLiteStatus FinishModelAllocation(const Model* model,
|
||||
TfLiteEvalTensor* eval_tensors);
|
||||
|
||||
// Allocates a TfLiteTensor struct and populates the returned value with
|
||||
// properties from the model flatbuffer. This struct is allocated from
|
||||
// persistent arena memory is only guaranteed for the lifetime of the
|
||||
// application. The eval_tensors pointer should be the value passed into this
|
||||
// class during StartModelAllocation() and contains the source-of-truth for
|
||||
// buffers.
|
||||
virtual TfLiteTensor* AllocatePersistentTfLiteTensor(
|
||||
const Model* model, TfLiteEvalTensor* eval_tensors, int tensor_index);
|
||||
|
||||
// Allocates a TfLiteTensor struct and populates the returned value with
|
||||
// properties from the model flatbuffer. This struct is allocated from
|
||||
// temporary arena memory is only guaranteed until a call is made to
|
||||
// ResetTempAllocations(). The eval_tensors pointer should be the value passed
|
||||
// into this class during StartModelAllocation() and contains the
|
||||
// source-of-truth for buffers.
|
||||
virtual TfLiteTensor* AllocateTempTfLiteTensor(const Model* model,
|
||||
TfLiteEvalTensor* eval_tensors,
|
||||
int tensor_index);
|
||||
|
||||
// Resets all temporary allocations. This method should be called after a
|
||||
// chain of temp allocations (e.g. chain of TfLiteTensor objects via
|
||||
// AllocateTfLiteTensor()).
|
||||
virtual void ResetTempAllocations();
|
||||
|
||||
// Allocates persistent buffer which has the same life time as the allocator.
|
||||
// The memory is immediately available and is allocated from the tail of the
|
||||
// arena.
|
||||
TfLiteStatus AllocatePersistentBuffer(size_t bytes, void** ptr);
|
||||
void* AllocatePersistentBuffer(size_t bytes);
|
||||
|
||||
// Register a scratch buffer of size `bytes` for Node with `node_id`.
|
||||
// This method only allocates a BufferHandle holding information for memory
|
||||
// planning. The buffer ptr is ready after `FinishTensorAllocation` and can
|
||||
// planning. The buffer ptr is ready after `FinishModelAllocation` and can
|
||||
// be retrieved by `GetScratchBuffer` method using the returned buffer_idx.
|
||||
// Note that there should be no tail allocation between two consecutive
|
||||
// `RequestScratchBufferInArena` calls.
|
||||
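The comments above describe the two-phase flow that replaces FinishTensorAllocation(). A minimal sketch of that flow, assuming model, op_resolver and error_reporter already exist; kernel init and prepare would normally run between the two calls:

// Sketch only: arena-backed MicroAllocator lifecycle.
alignas(16) static uint8_t tensor_arena[8 * 1024];
MicroAllocator* allocator =
    MicroAllocator::Create(tensor_arena, sizeof(tensor_arena), error_reporter);
NodeAndRegistration* node_and_registrations = nullptr;
TfLiteEvalTensor* eval_tensors = nullptr;
if (allocator->StartModelAllocation(model, op_resolver,
                                    &node_and_registrations,
                                    &eval_tensors) == kTfLiteOk &&
    allocator->FinishModelAllocation(model, eval_tensors) == kTfLiteOk) {
  size_t used = allocator->used_bytes();  // only meaningful after Finish
  (void)used;
}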
@ -116,16 +167,74 @@ class MicroAllocator {
|
||||
// Returns the pointer to the planned scratch buffer.
|
||||
void* GetScratchBuffer(int buffer_idx) const;
|
||||
|
||||
private:
|
||||
TfLiteStatus Init();
|
||||
// Returns the arena usage in bytes, only available after
|
||||
// `FinishModelAllocation`. Otherwise, it will return 0.
|
||||
size_t used_bytes() const;
|
||||
|
||||
const Model* model_;
|
||||
// A simple memory allocator that always allocate from the arena tail.
|
||||
protected:
|
||||
MicroAllocator(SimpleMemoryAllocator* memory_allocator,
|
||||
ErrorReporter* error_reporter);
|
||||
virtual ~MicroAllocator();
|
||||
|
||||
// Allocates an array in the arena to hold pointers to the node and
|
||||
// registration pointers required to represent the inference graph of the
|
||||
// model.
|
||||
virtual TfLiteStatus AllocateNodeAndRegistrations(
|
||||
const Model* model, NodeAndRegistration** node_and_registrations);
|
||||
|
||||
// Populates node and registration pointers representing the inference graph
|
||||
// of the model from values inside the flatbuffer (loaded from the TfLiteModel
|
||||
// instance). Persistent data (e.g. operator data) is allocated from the
|
||||
// arena.
|
||||
virtual TfLiteStatus PrepareNodeAndRegistrationDataFromFlatbuffer(
|
||||
const Model* model, const MicroOpResolver& op_resolver,
|
||||
NodeAndRegistration* node_and_registrations);
|
||||
|
||||
// Allocates the list of persistent TfLiteEvalTensors that are used for the
|
||||
// "eval" phase of model inference. These structs will be the source of truth
|
||||
// for all tensor buffers. Allocation results are stored in the out-param
|
||||
// eval_tensors.
|
||||
virtual TfLiteStatus AllocateTfLiteEvalTensors(
|
||||
const Model* model, TfLiteEvalTensor** eval_tensors);
|
||||
|
||||
// Allocates persistent tensor buffers for variable tensors in the subgraph.
|
||||
virtual TfLiteStatus AllocateVariables(const SubGraph* subgraph,
|
||||
TfLiteEvalTensor* eval_tensors);
|
||||
|
||||
// TODO(b/160894903): Once all kernels have been updated to the new API drop
|
||||
// this method. It is only used to record TfLiteTensor persistent allocations.
|
||||
virtual TfLiteTensor* AllocatePersistentTfLiteTensorInternal(
|
||||
const Model* model, TfLiteEvalTensor* eval_tensors, int tensor_index);
|
||||
|
||||
// Populates a TfLiteTensor struct with data from the model flatbuffer. Any
|
||||
// quantization data is allocated from either the tail (persistent) or temp
|
||||
// sections of the arena based on the allocation flag.
|
||||
// TODO(b/160894903): Once all kernels have been updated to the new API drop
|
||||
// this function since all allocations for quantized data will take place in
|
||||
// the temp section.
|
||||
virtual TfLiteStatus PopulateTfLiteTensorFromFlatbuffer(
|
||||
const Model* model, const SubGraph* subgraph, TfLiteTensor* tensor,
|
||||
int tensor_index, bool allocate_temp);
|
||||
|
||||
ErrorReporter* error_reporter() const;
|
||||
|
||||
// Returns the first subgraph from the model.
|
||||
const SubGraph* GetSubGraphFromModel(const Model* model);
|
||||
|
||||
private:
|
||||
// Commits a memory plan for all non-persistent buffer allocations in the
|
||||
// 'head' section of the memory arena. The eval_tensors pointer is the list of
|
||||
// pre-allocated TfLiteEvalTensor structs that will point to the buffers that
|
||||
// will be allocated into the head section in this function call.
|
||||
virtual TfLiteStatus CommitStaticMemoryPlan(const Model* model,
|
||||
const SubGraph* subgraph,
|
||||
TfLiteEvalTensor* eval_tensors);
|
||||
|
||||
// A simple memory allocator that always allocate from the arena tail or head.
|
||||
SimpleMemoryAllocator* memory_allocator_;
|
||||
|
||||
ErrorReporter* error_reporter_;
|
||||
TfLiteContext* context_;
|
||||
// Indicating if the allocator is ready for allocation.
|
||||
bool active_ = false;
|
||||
bool model_is_allocating_;
|
||||
|
||||
// In reverse order for efficiency.
|
||||
// i.e. scratch_buffer_handles_[0] is the handle for the last buffer,
|
||||
@ -134,9 +243,7 @@ class MicroAllocator {
|
||||
// How many scratch buffers have been allocated.
|
||||
size_t scratch_buffer_count_ = 0;
|
||||
|
||||
const SubGraph* subgraph_;
|
||||
const flatbuffers::Vector<flatbuffers::Offset<Operator>>* operators_;
|
||||
const flatbuffers::Vector<flatbuffers::Offset<Tensor>>* tensors_;
|
||||
TF_LITE_REMOVE_VIRTUAL_DELETE
|
||||
};
|
||||
|
||||
} // namespace tflite
|
||||
|
||||
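From a kernel's point of view, the allocator entry points above are reached through the TfLiteContext hooks wired up by the interpreter. A sketch of the usual split, with OpDataSketch and the 512-byte scratch size as placeholders:

// Sketch only: persistent data in Init, a scratch request in Prepare, and
// the resolved scratch pointer in Eval (valid after FinishModelAllocation).
struct OpDataSketch {
  int scratch_index;
};

void* OpInitSketch(TfLiteContext* context, const char* buffer, size_t length) {
  return context->AllocatePersistentBuffer(context, sizeof(OpDataSketch));
}

TfLiteStatus OpPrepareSketch(TfLiteContext* context, TfLiteNode* node) {
  auto* data = static_cast<OpDataSketch*>(node->user_data);
  return context->RequestScratchBufferInArena(context, /*bytes=*/512,
                                              &data->scratch_index);
}

TfLiteStatus OpEvalSketch(TfLiteContext* context, TfLiteNode* node) {
  auto* data = static_cast<OpDataSketch*>(node->user_data);
  void* scratch = context->GetScratchBuffer(context, data->scratch_index);
  (void)scratch;  // temporary working memory for this invocation
  return kTfLiteOk;
}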
@ -15,7 +15,10 @@ limitations under the License.
|
||||
|
||||
#include "tensorflow/lite/micro/micro_error_reporter.h"
|
||||
|
||||
#include <cstdarg>
|
||||
|
||||
#ifndef TF_LITE_STRIP_ERROR_STRINGS
|
||||
#include "tensorflow/lite/micro/debug_log.h"
|
||||
#include "tensorflow/lite/micro/micro_string.h"
|
||||
#endif
|
||||
|
||||
|
||||
@ -12,12 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
// Minor modifications made for Espruino Microcontroller build by Gordon Williams <gw@pur3.co.uk>
|
||||
#ifndef TENSORFLOW_LITE_MICRO_MICRO_ERROR_REPORTER_H_
|
||||
#define TENSORFLOW_LITE_MICRO_MICRO_ERROR_REPORTER_H_
|
||||
|
||||
#include <cstdarg>
|
||||
|
||||
#include "tensorflow/lite/core/api/error_reporter.h"
|
||||
#include "tensorflow/lite/micro/compatibility.h"
|
||||
#include "tensorflow/lite/micro/debug_log.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
@ -25,8 +27,6 @@ class MicroErrorReporter : public ErrorReporter {
|
||||
public:
|
||||
~MicroErrorReporter() override {}
|
||||
int Report(const char* format, va_list args) override;
|
||||
|
||||
private:
|
||||
TF_LITE_REMOVE_VIRTUAL_DELETE
|
||||
};
|
||||
|
||||
|
||||
@ -15,15 +15,16 @@ limitations under the License.
|
||||
#ifndef TENSORFLOW_LITE_MICRO_MICRO_ERROR_REPORTER_H_
|
||||
#define TENSORFLOW_LITE_MICRO_MICRO_ERROR_REPORTER_H_
|
||||
|
||||
#include <cstdarg>
|
||||
|
||||
#include "tensorflow/lite/core/api/error_reporter.h"
|
||||
#include "tensorflow/lite/micro/compatibility.h"
|
||||
#include "tensorflow/lite/micro/debug_log.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
class MicroErrorReporter : public ErrorReporter {
|
||||
public:
|
||||
~MicroErrorReporter() {}
|
||||
~MicroErrorReporter() override {}
|
||||
int Report(const char* format, va_list args) override;
|
||||
|
||||
private:
|
||||
|
||||
@ -14,16 +14,24 @@ limitations under the License.
|
||||
==============================================================================*/
|
||||
#include "tensorflow/lite/micro/micro_interpreter.h"
|
||||
|
||||
#include <cstdarg>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
|
||||
#include "flatbuffers/flatbuffers.h" // from @flatbuffers
|
||||
#include "tensorflow/lite/c/common.h"
|
||||
#include "tensorflow/lite/core/api/flatbuffer_conversions.h"
|
||||
#include "tensorflow/lite/core/api/error_reporter.h"
|
||||
#include "tensorflow/lite/core/api/tensor_utils.h"
|
||||
#include "tensorflow/lite/micro/compatibility.h"
|
||||
#include "tensorflow/lite/micro/memory_helpers.h"
|
||||
#include "tensorflow/lite/micro/micro_allocator.h"
|
||||
#include "tensorflow/lite/micro/micro_optional_debug_tools.h"
|
||||
#include "tensorflow/lite/micro/micro_op_resolver.h"
|
||||
#include "tensorflow/lite/micro/micro_profiler.h"
|
||||
#include "tensorflow/lite/schema/schema_generated.h"
|
||||
|
||||
namespace tflite {
|
||||
namespace {
|
||||
|
||||
#ifndef TF_LITE_STRIP_ERROR_STRINGS
|
||||
const char* OpNameFromRegistration(const TfLiteRegistration* registration) {
|
||||
if (registration->builtin_code == BuiltinOperator_CUSTOM) {
|
||||
return registration->custom_name;
|
||||
@ -31,15 +39,20 @@ const char* OpNameFromRegistration(const TfLiteRegistration* registration) {
|
||||
return EnumNameBuiltinOperator(BuiltinOperator(registration->builtin_code));
|
||||
}
|
||||
}
|
||||
#endif // !defined(TF_LITE_STRIP_ERROR_STRINGS)
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace internal {
|
||||
|
||||
TfLiteStatus ContextHelper::AllocatePersistentBuffer(TfLiteContext* ctx,
|
||||
size_t bytes, void** ptr) {
|
||||
ContextHelper::ContextHelper(ErrorReporter* error_reporter,
|
||||
MicroAllocator* allocator, const Model* model)
|
||||
: allocator_(allocator), error_reporter_(error_reporter), model_(model) {}
|
||||
|
||||
void* ContextHelper::AllocatePersistentBuffer(TfLiteContext* ctx,
|
||||
size_t bytes) {
|
||||
return reinterpret_cast<ContextHelper*>(ctx->impl_)
|
||||
->allocator_->AllocatePersistentBuffer(bytes, ptr);
|
||||
->allocator_->AllocatePersistentBuffer(bytes);
|
||||
}
|
||||
|
||||
TfLiteStatus ContextHelper::RequestScratchBufferInArena(TfLiteContext* ctx,
|
||||
@ -57,55 +70,72 @@ void* ContextHelper::GetScratchBuffer(TfLiteContext* ctx, int buffer_idx) {
|
||||
|
||||
void ContextHelper::ReportOpError(struct TfLiteContext* context,
|
||||
const char* format, ...) {
|
||||
#ifndef TF_LITE_STRIP_ERROR_STRINGS
|
||||
ContextHelper* helper = static_cast<ContextHelper*>(context->impl_);
|
||||
va_list args;
|
||||
va_start(args, format);
|
||||
TF_LITE_REPORT_ERROR(helper->error_reporter_, format, args);
|
||||
va_end(args);
|
||||
#endif
|
||||
}
|
||||
|
||||
TfLiteTensor* ContextHelper::GetTensor(const struct TfLiteContext* context,
|
||||
int tensor_idx) {
|
||||
ContextHelper* helper = static_cast<ContextHelper*>(context->impl_);
|
||||
return helper->allocator_->AllocateTempTfLiteTensor(
|
||||
helper->model_, helper->eval_tensors_, tensor_idx);
|
||||
}
|
||||
|
||||
TfLiteEvalTensor* ContextHelper::GetEvalTensor(
|
||||
const struct TfLiteContext* context, int tensor_idx) {
|
||||
ContextHelper* helper = reinterpret_cast<ContextHelper*>(context->impl_);
|
||||
return &helper->eval_tensors_[tensor_idx];
|
||||
}
|
||||
|
||||
void ContextHelper::SetNodeIndex(int idx) { current_node_idx_ = idx; }
|
||||
|
||||
void ContextHelper::SetTfLiteEvalTensors(TfLiteEvalTensor* eval_tensors) {
|
||||
eval_tensors_ = eval_tensors;
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
|
||||
MicroInterpreter::MicroInterpreter(const Model* model,
|
||||
const OpResolver& op_resolver,
|
||||
const MicroOpResolver& op_resolver,
|
||||
uint8_t* tensor_arena,
|
||||
size_t tensor_arena_size,
|
||||
ErrorReporter* error_reporter)
|
||||
ErrorReporter* error_reporter,
|
||||
tflite::Profiler* profiler)
|
||||
: model_(model),
|
||||
op_resolver_(op_resolver),
|
||||
error_reporter_(error_reporter),
|
||||
allocator_(&context_, model_, tensor_arena, tensor_arena_size,
|
||||
error_reporter_),
|
||||
allocator_(*MicroAllocator::Create(tensor_arena, tensor_arena_size,
|
||||
error_reporter)),
|
||||
tensors_allocated_(false),
|
||||
context_helper_(error_reporter_, &allocator_) {
|
||||
const flatbuffers::Vector<flatbuffers::Offset<SubGraph>>* subgraphs =
|
||||
model->subgraphs();
|
||||
if (subgraphs->size() != 1) {
|
||||
TF_LITE_REPORT_ERROR(error_reporter,
|
||||
"Only 1 subgraph is currently supported.\n");
|
||||
initialization_status_ = kTfLiteError;
|
||||
return;
|
||||
}
|
||||
subgraph_ = (*subgraphs)[0];
|
||||
initialization_status_(kTfLiteError),
|
||||
eval_tensors_(nullptr),
|
||||
context_helper_(error_reporter_, &allocator_, model),
|
||||
input_tensor_(nullptr),
|
||||
output_tensor_(nullptr) {
|
||||
Init(profiler);
|
||||
}
|
||||
|
||||
context_.impl_ = static_cast<void*>(&context_helper_);
|
||||
context_.ReportError = context_helper_.ReportOpError;
|
||||
context_.recommended_num_threads = 1;
|
||||
|
||||
// If the system is big endian then convert weights from the flatbuffer from
|
||||
// little to big endian on startup so that it does not need to be done during
|
||||
// inference.
|
||||
// NOTE: This requires that the flatbuffer is held in memory which can be
|
||||
// modified by this process.
|
||||
if (!FLATBUFFERS_LITTLEENDIAN) {
|
||||
for (size_t t = 0; t < tensors_size(); ++t) {
|
||||
TfLiteTensor* thisTensor = &context_.tensors[t];
|
||||
if (thisTensor->allocation_type == kTfLiteMmapRo)
|
||||
CorrectTensorEndianness(thisTensor);
|
||||
}
|
||||
}
|
||||
|
||||
initialization_status_ = kTfLiteOk;
|
||||
MicroInterpreter::MicroInterpreter(const Model* model,
|
||||
const MicroOpResolver& op_resolver,
|
||||
MicroAllocator* allocator,
|
||||
ErrorReporter* error_reporter,
|
||||
tflite::Profiler* profiler)
|
||||
: model_(model),
|
||||
op_resolver_(op_resolver),
|
||||
error_reporter_(error_reporter),
|
||||
allocator_(*allocator),
|
||||
tensors_allocated_(false),
|
||||
initialization_status_(kTfLiteError),
|
||||
eval_tensors_(nullptr),
|
||||
context_helper_(error_reporter_, &allocator_, model),
|
||||
input_tensor_(nullptr),
|
||||
output_tensor_(nullptr) {
|
||||
Init(profiler);
|
||||
}
|
||||
|
||||
MicroInterpreter::~MicroInterpreter() {
|
||||
@ -123,7 +153,28 @@ MicroInterpreter::~MicroInterpreter() {
|
||||
}
|
||||
}
|
||||
|
||||
void MicroInterpreter::CorrectTensorEndianness(TfLiteTensor* tensorCorr) {
|
||||
void MicroInterpreter::Init(tflite::Profiler* profiler) {
|
||||
const flatbuffers::Vector<flatbuffers::Offset<SubGraph>>* subgraphs =
|
||||
model_->subgraphs();
|
||||
if (subgraphs->size() != 1) {
|
||||
TF_LITE_REPORT_ERROR(error_reporter_,
|
||||
"Only 1 subgraph is currently supported.\n");
|
||||
initialization_status_ = kTfLiteError;
|
||||
return;
|
||||
}
|
||||
subgraph_ = (*subgraphs)[0];
|
||||
|
||||
context_.impl_ = static_cast<void*>(&context_helper_);
|
||||
context_.ReportError = context_helper_.ReportOpError;
|
||||
context_.GetTensor = context_helper_.GetTensor;
|
||||
context_.GetEvalTensor = context_helper_.GetEvalTensor;
|
||||
context_.recommended_num_threads = 1;
|
||||
context_.profiler = profiler;
|
||||
|
||||
initialization_status_ = kTfLiteOk;
|
||||
}
|
||||
|
||||
void MicroInterpreter::CorrectTensorEndianness(TfLiteEvalTensor* tensorCorr) {
|
||||
int32_t tensorSize = 1;
|
||||
for (int d = 0; d < tensorCorr->dims->size; ++d)
|
||||
tensorSize *= reinterpret_cast<const int32_t*>(tensorCorr->dims->data)[d];
|
||||
@ -147,6 +198,9 @@ void MicroInterpreter::CorrectTensorEndianness(TfLiteTensor* tensorCorr) {
|
||||
case TfLiteType::kTfLiteComplex64:
|
||||
CorrectTensorDataEndianness(tensorCorr->data.c64, tensorSize);
|
||||
break;
|
||||
case TfLiteType::kTfLiteComplex128:
|
||||
CorrectTensorDataEndianness(tensorCorr->data.c128, tensorSize);
|
||||
break;
|
||||
default:
|
||||
// Do nothing for other data types.
|
||||
break;
|
||||
@ -161,8 +215,42 @@ void MicroInterpreter::CorrectTensorDataEndianness(T* data, int32_t size) {
|
||||
}
|
||||
|
||||
TfLiteStatus MicroInterpreter::AllocateTensors() {
|
||||
TF_LITE_ENSURE_OK(&context_, allocator_.AllocateNodeAndRegistrations(
|
||||
op_resolver_, &node_and_registrations_));
|
||||
if (allocator_.StartModelAllocation(model_, op_resolver_,
|
||||
&node_and_registrations_,
|
||||
&eval_tensors_) != kTfLiteOk) {
|
||||
TF_LITE_REPORT_ERROR(error_reporter_,
|
||||
"Failed starting model allocation.\n");
|
||||
initialization_status_ = kTfLiteError;
|
||||
return kTfLiteError;
|
||||
}
|
||||
|
||||
// Update the pointer now that TfLiteEvalTensor allocation has completed on
|
||||
// the context helper.
|
||||
// TODO(b/16157777): This call would not be needed if ContextHelper rolled
|
||||
// into the interpreter.
|
||||
context_helper_.SetTfLiteEvalTensors(eval_tensors_);
|
||||
|
||||
// If the system is big endian then convert weights from the flatbuffer from
|
||||
// little to big endian on startup so that it does not need to be done during
|
||||
// inference.
|
||||
// NOTE: This requires that the flatbuffer is held in memory which can be
|
||||
// modified by this process.
|
||||
if (!FLATBUFFERS_LITTLEENDIAN) {
|
||||
for (size_t t = 0; t < subgraph_->tensors()->size(); ++t) {
|
||||
if (auto* buffer =
|
||||
(*model_->buffers())[subgraph_->tensors()->Get(t)->buffer()]) {
|
||||
// If we've found a buffer, does it have any data?
|
||||
if (auto* array = buffer->data()) {
|
||||
// If it has any data, is the data size larger than zero?
|
||||
if (array->size()) {
|
||||
// Update the endianness of the corresponding eval tensor since that
|
||||
// struct holds the buffer used at inference time.
|
||||
CorrectTensorEndianness(&eval_tensors_[t]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Only allow AllocatePersistentBuffer in Init stage.
|
||||
context_.AllocatePersistentBuffer = context_helper_.AllocatePersistentBuffer;
|
||||
@ -189,8 +277,8 @@ TfLiteStatus MicroInterpreter::AllocateTensors() {
|
||||
}
|
||||
context_helper_.SetNodeIndex(-1);
|
||||
|
||||
// Both AllocatePersistentBuffer and RequestScratchBufferInArena is available
|
||||
// in Prepare stage.
|
||||
// Both AllocatePersistentBuffer and RequestScratchBufferInArena is
|
||||
// available in Prepare stage.
|
||||
context_.RequestScratchBufferInArena =
|
||||
context_helper_.RequestScratchBufferInArena;
|
||||
for (size_t i = 0; i < subgraph_->operators()->size(); ++i) {
|
||||
@ -208,6 +296,7 @@ TfLiteStatus MicroInterpreter::AllocateTensors() {
|
||||
return kTfLiteError;
|
||||
}
|
||||
}
|
||||
allocator_.ResetTempAllocations();
|
||||
}
|
||||
context_helper_.SetNodeIndex(-1);
|
||||
|
||||
@ -217,7 +306,10 @@ TfLiteStatus MicroInterpreter::AllocateTensors() {
|
||||
context_.RequestScratchBufferInArena = nullptr;
|
||||
context_.GetScratchBuffer = context_helper_.GetScratchBuffer;
|
||||
|
||||
TF_LITE_ENSURE_OK(&context_, allocator_.FinishTensorAllocation());
|
||||
TF_LITE_ENSURE_OK(&context_,
|
||||
allocator_.FinishModelAllocation(model_, eval_tensors_));
|
||||
TF_LITE_ENSURE_STATUS(ResetVariableTensors());
|
||||
|
||||
tensors_allocated_ = true;
|
||||
return kTfLiteOk;
|
||||
}
|
||||
@ -240,7 +332,23 @@ TfLiteStatus MicroInterpreter::Invoke() {
|
||||
auto* registration = node_and_registrations_[i].registration;
|
||||
|
||||
if (registration->invoke) {
|
||||
TfLiteStatus invoke_status = registration->invoke(&context_, node);
|
||||
TfLiteStatus invoke_status;
|
||||
#ifndef NDEBUG // Omit profiler overhead from release builds.
|
||||
// The case where profiler == nullptr is handled by
|
||||
// ScopedOperatorProfile.
|
||||
tflite::Profiler* profiler =
|
||||
reinterpret_cast<tflite::Profiler*>(context_.profiler);
|
||||
ScopedOperatorProfile scoped_profiler(
|
||||
profiler, OpNameFromRegistration(registration), i);
|
||||
#endif
|
||||
invoke_status = registration->invoke(&context_, node);
|
||||
|
||||
// All TfLiteTensor structs used in the kernel are allocated from temp
|
||||
// memory in the allocator. This creates a chain of allocations in the
|
||||
// temp section. The call below resets the chain of allocations to
|
||||
// prepare for the next call.
|
||||
allocator_.ResetTempAllocations();
|
||||
|
||||
if (invoke_status == kTfLiteError) {
|
||||
TF_LITE_REPORT_ERROR(
|
||||
error_reporter_,
|
||||
@ -257,50 +365,82 @@ TfLiteStatus MicroInterpreter::Invoke() {
|
||||
|
||||
TfLiteTensor* MicroInterpreter::input(size_t index) {
|
||||
const size_t length = inputs_size();
|
||||
if ((index < 0) || (index >= length)) {
|
||||
if (index >= length) {
|
||||
TF_LITE_REPORT_ERROR(error_reporter_,
|
||||
"Input index %d out of range (length is %d)", index,
|
||||
length);
|
||||
return nullptr;
|
||||
}
|
||||
return &(context_.tensors[inputs().Get(index)]);
|
||||
if (index != 0) {
|
||||
TF_LITE_REPORT_ERROR(
|
||||
error_reporter_,
|
||||
"Input tensors not at index 0 are allocated from the "
|
||||
"persistent memory arena. Repeat calls will cause excess "
|
||||
"allocation!");
|
||||
return allocator_.AllocatePersistentTfLiteTensor(model_, eval_tensors_,
|
||||
inputs().Get(index));
|
||||
}
|
||||
if (input_tensor_ == nullptr) {
|
||||
input_tensor_ = allocator_.AllocatePersistentTfLiteTensor(
|
||||
model_, eval_tensors_, inputs().Get(index));
|
||||
}
|
||||
return input_tensor_;
|
||||
}
|
||||
|
||||
TfLiteTensor* MicroInterpreter::output(size_t index) {
|
||||
const size_t length = outputs_size();
|
||||
if ((index < 0) || (index >= length)) {
|
||||
if (index >= length) {
|
||||
TF_LITE_REPORT_ERROR(error_reporter_,
|
||||
"Output index %d out of range (length is %d)", index,
|
||||
length);
|
||||
return nullptr;
|
||||
}
|
||||
return &(context_.tensors[outputs().Get(index)]);
|
||||
if (index != 0) {
|
||||
TF_LITE_REPORT_ERROR(
|
||||
error_reporter_,
|
||||
"Output tensors not at index 0 are allocated from the "
|
||||
"persistent memory arena. Repeat calls will cause excess "
|
||||
"allocation!");
|
||||
return allocator_.AllocatePersistentTfLiteTensor(model_, eval_tensors_,
|
||||
outputs().Get(index));
|
||||
}
|
||||
if (output_tensor_ == nullptr) {
|
||||
// TODO(b/160894903): This API will allocate TfLiteTensor structs from
|
||||
// persistent (tail) memory and cache on this pointer.
|
||||
output_tensor_ = allocator_.AllocatePersistentTfLiteTensor(
|
||||
model_, eval_tensors_, outputs().Get(index));
|
||||
}
|
||||
return output_tensor_;
|
||||
}
|
||||
|
||||
TfLiteTensor* MicroInterpreter::tensor(size_t index) {
|
||||
const size_t length = tensors_size();
|
||||
if ((index < 0) || (index >= length)) {
|
||||
if (index >= length) {
|
||||
TF_LITE_REPORT_ERROR(error_reporter_,
|
||||
"Tensor index %d out of range (length is %d)", index,
|
||||
length);
|
||||
return nullptr;
|
||||
}
|
||||
return &context_.tensors[index];
|
||||
return allocator_.AllocatePersistentTfLiteTensor(model_, eval_tensors_,
|
||||
index);
|
||||
}
|
||||
|
||||
TfLiteStatus MicroInterpreter::ResetVariableTensors() {
|
||||
const size_t length = tensors_size();
|
||||
for (size_t i = 0; i < length; ++i) {
|
||||
TfLiteTensor* cur_tensor = tensor(i);
|
||||
if (cur_tensor->is_variable) {
|
||||
TfLiteStatus status = tflite::ResetVariableTensor(cur_tensor);
|
||||
if (status != kTfLiteOk) {
|
||||
TF_LITE_REPORT_ERROR(error_reporter_,
|
||||
"Failed to reset variable tensor at index: %d", i);
|
||||
return status;
|
||||
for (size_t i = 0; i < subgraph_->tensors()->size(); ++i) {
|
||||
auto* tensor = subgraph_->tensors()->Get(i);
|
||||
if (tensor->is_variable()) {
|
||||
size_t buffer_size;
|
||||
TF_LITE_ENSURE_STATUS(
|
||||
TfLiteEvalTensorByteLength(&eval_tensors_[i], &buffer_size));
|
||||
|
||||
int value = 0;
|
||||
if (tensor->type() == tflite::TensorType_INT8) {
|
||||
value = tensor->quantization()->zero_point()->Get(0);
|
||||
}
|
||||
memset(eval_tensors_[i].data.raw, value, buffer_size);
|
||||
}
|
||||
}
|
||||
|
||||
return kTfLiteOk;
|
||||
}
|
||||
|
||||
|
||||
@ -15,13 +15,18 @@ limitations under the License.
|
||||
#ifndef TENSORFLOW_LITE_MICRO_MICRO_INTERPRETER_H_
|
||||
#define TENSORFLOW_LITE_MICRO_MICRO_INTERPRETER_H_
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
|
||||
#include "flatbuffers/flatbuffers.h" // from @flatbuffers
|
||||
#include "tensorflow/lite/c/common.h"
|
||||
#include "tensorflow/lite/core/api/error_reporter.h"
|
||||
#include "tensorflow/lite/core/api/op_resolver.h"
|
||||
#include "tensorflow/lite/core/api/profiler.h"
|
||||
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
|
||||
#include "tensorflow/lite/micro/micro_allocator.h"
|
||||
#include "tensorflow/lite/micro/micro_op_resolver.h"
|
||||
#include "tensorflow/lite/portable_type_to_tflitetype.h"
|
||||
#include "tensorflow/lite/schema/schema_generated.h"
|
||||
#include "tensorflow/lite/type_to_tflitetype.h"
|
||||
|
||||
namespace tflite {
|
||||
|
||||
@ -30,29 +35,36 @@ namespace internal {
|
||||
// A helper class to encapsulate the implementation of APIs in Context.
|
||||
// context->impl_ points to an instance of this class.
|
||||
// Check tensorflow/lite/c/common.h for detailed descriptions.
|
||||
// TODO(b/16157777): Consider rolling this class into MicroInterpreter.
|
||||
class ContextHelper {
|
||||
public:
|
||||
explicit ContextHelper(ErrorReporter* error_reporter,
|
||||
MicroAllocator* allocator)
|
||||
: allocator_(allocator), error_reporter_(error_reporter) {}
|
||||
|
||||
static TfLiteStatus AllocatePersistentBuffer(TfLiteContext* ctx, size_t bytes,
|
||||
void** ptr);
|
||||
MicroAllocator* allocator, const Model* model);
|
||||
|
||||
// Functions that will be assigned to function pointers on TfLiteContext:
|
||||
static void* AllocatePersistentBuffer(TfLiteContext* ctx, size_t bytes);
|
||||
static TfLiteStatus RequestScratchBufferInArena(TfLiteContext* ctx,
|
||||
size_t bytes,
|
||||
int* buffer_idx);
|
||||
|
||||
static void* GetScratchBuffer(TfLiteContext* ctx, int buffer_idx);
|
||||
|
||||
static void ReportOpError(struct TfLiteContext* context, const char* format,
|
||||
...);
|
||||
static TfLiteTensor* GetTensor(const struct TfLiteContext* context,
|
||||
int tensor_idx);
|
||||
static TfLiteEvalTensor* GetEvalTensor(const struct TfLiteContext* context,
|
||||
int tensor_idx);
|
||||
|
||||
void SetNodeIndex(int idx) { current_node_idx_ = idx; }
|
||||
// Sets the current node index to assist with scratch buffer allocations:
|
||||
void SetNodeIndex(int idx);
|
||||
|
||||
// Sets the pointer to a list of TfLiteEvalTensor instances.
|
||||
void SetTfLiteEvalTensors(TfLiteEvalTensor* eval_tensors);
|
||||
|
||||
private:
|
||||
MicroAllocator* allocator_;
|
||||
ErrorReporter* error_reporter_;
|
||||
const Model* model_;
|
||||
TfLiteEvalTensor* eval_tensors_;
|
||||
int current_node_idx_ = -1;
|
||||
};
|
||||
|
||||
@ -60,17 +72,26 @@ class ContextHelper {
|
||||
|
||||
class MicroInterpreter {
|
||||
public:
|
||||
// The lifetime of the model, op resolver, tensor arena, and error reporter
|
||||
// must be at least as long as that of the interpreter object, since the
|
||||
// interpreter may need to access them at any time. This means that you should
|
||||
// usually create them with the same scope as each other, for example having
|
||||
// them all allocated on the stack as local variables through a top-level
|
||||
// function.
|
||||
// The interpreter doesn't do any deallocation of any of the pointed-to
|
||||
// objects, ownership remains with the caller.
|
||||
MicroInterpreter(const Model* model, const OpResolver& op_resolver,
|
||||
// The lifetime of the model, op resolver, tensor arena, error reporter and
|
||||
// profiler must be at least as long as that of the interpreter object, since
|
||||
// the interpreter may need to access them at any time. This means that you
|
||||
// should usually create them with the same scope as each other, for example
|
||||
// having them all allocated on the stack as local variables through a
|
||||
// top-level function. The interpreter doesn't do any deallocation of any of
|
||||
// the pointed-to objects, ownership remains with the caller.
|
||||
MicroInterpreter(const Model* model, const MicroOpResolver& op_resolver,
|
||||
uint8_t* tensor_arena, size_t tensor_arena_size,
|
||||
ErrorReporter* error_reporter);
|
||||
ErrorReporter* error_reporter,
|
||||
tflite::Profiler* profiler = nullptr);
|
||||
|
||||
// Create an interpreter instance using an existing MicroAllocator instance.
|
||||
// This constructor should be used when creating an allocator that needs to
|
||||
// have allocation handled in more than one interpreter or for recording
|
||||
// allocations inside the interpreter. The lifetime of the allocator must be
|
||||
// as long as that of the interpreter object.
|
||||
MicroInterpreter(const Model* model, const MicroOpResolver& op_resolver,
|
||||
MicroAllocator* allocator, ErrorReporter* error_reporter,
|
||||
tflite::Profiler* profiler = nullptr);
|
||||
|
||||
~MicroInterpreter();
|
||||
|
||||
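Putting the constructor above together with the resolver and error reporter from this update, a typical setup looks roughly like this; the model data, arena size, and the set of ops to register are placeholders:

// Sketch only: end-to-end interpreter setup with the arena-owning constructor.
tflite::MicroErrorReporter micro_error_reporter;
const tflite::Model* model = tflite::GetModel(g_model_data);  // g_model_data assumed
tflite::MicroMutableOpResolver<4> resolver(&micro_error_reporter);
// resolver.AddBuiltin(...) here for each operator the model uses; the exact
// Register_*() form depends on how far that kernel's migration has progressed.
alignas(16) static uint8_t tensor_arena[4 * 1024];
tflite::MicroInterpreter interpreter(model, resolver, tensor_arena,
                                     sizeof(tensor_arena),
                                     &micro_error_reporter);
if (interpreter.AllocateTensors() == kTfLiteOk) {
  TfLiteTensor* input = interpreter.input(0);  // cached persistent TfLiteTensor
  // ... fill input->data ...
  if (interpreter.Invoke() == kTfLiteOk) {
    TfLiteTensor* output = interpreter.output(0);
    (void)output;
  }
}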
@ -147,8 +168,16 @@ class MicroInterpreter {
|
||||
// arena_used_bytes() + 16.
|
||||
size_t arena_used_bytes() const { return allocator_.used_bytes(); }
|
||||
|
||||
protected:
|
||||
const MicroAllocator& allocator() const { return allocator_; }
|
||||
const TfLiteContext& context() const { return context_; }
|
||||
|
||||
private:
|
||||
void CorrectTensorEndianness(TfLiteTensor* tensorCorr);
|
||||
// TODO(b/158263161): Consider switching to Create() function to enable better
|
||||
// error reporting during initialization.
|
||||
void Init(tflite::Profiler* profiler);
|
||||
|
||||
void CorrectTensorEndianness(TfLiteEvalTensor* tensorCorr);
|
||||
|
||||
template <class T>
|
||||
void CorrectTensorDataEndianness(T* data, int32_t size);
|
||||
@ -156,16 +185,22 @@ class MicroInterpreter {
|
||||
NodeAndRegistration* node_and_registrations_ = nullptr;
|
||||
|
||||
const Model* model_;
|
||||
const OpResolver& op_resolver_;
|
||||
const MicroOpResolver& op_resolver_;
|
||||
ErrorReporter* error_reporter_;
|
||||
TfLiteContext context_ = {};
|
||||
MicroAllocator allocator_;
|
||||
MicroAllocator& allocator_;
|
||||
bool tensors_allocated_;
|
||||
|
||||
TfLiteStatus initialization_status_;
|
||||
|
||||
const SubGraph* subgraph_;
|
||||
TfLiteEvalTensor* eval_tensors_;
|
||||
internal::ContextHelper context_helper_;
|
||||
|
||||
// TODO(b/160894903): Clean these pointers up when all APIs are updated to new
|
||||
// TfLiteEvalTensor buffers.
|
||||
TfLiteTensor* input_tensor_;
|
||||
TfLiteTensor* output_tensor_;
|
||||
};
|
||||
|
||||
} // namespace tflite
|
||||
|
||||
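Not part of the upstream diff: a minimal sketch of how the updated constructor is typically driven. The arena size, the buffer names and the idea of passing the resolver in from the caller are assumptions for illustration, not code from this commit.

// Sketch only: kArenaSize, tensor_arena and RunOnce are hypothetical names.
#include <cstddef>
#include <cstdint>

#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/micro/micro_error_reporter.h"
#include "tensorflow/lite/micro/micro_interpreter.h"
#include "tensorflow/lite/micro/micro_op_resolver.h"
#include "tensorflow/lite/schema/schema_generated.h"

constexpr size_t kArenaSize = 8 * 1024;  // assumed; size it for your model
static uint8_t tensor_arena[kArenaSize];

// The model, resolver, arena and error reporter are all kept alive for the
// whole life of the interpreter, as the lifetime comments above require.
TfLiteStatus RunOnce(const void* model_data,
                     const tflite::MicroOpResolver& resolver) {
  static tflite::MicroErrorReporter error_reporter;
  const tflite::Model* model = tflite::GetModel(model_data);
  // The trailing Profiler* argument defaults to nullptr, so existing callers
  // only gain an optional parameter.
  tflite::MicroInterpreter interpreter(model, resolver, tensor_arena,
                                       kArenaSize, &error_reporter);
  TfLiteStatus status = interpreter.AllocateTensors();
  if (status != kTfLiteOk) return status;
  // Fill interpreter.input(0) here, then run the graph.
  return interpreter.Invoke();
}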
@ -1,4 +1,4 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -16,110 +16,441 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_MICRO_MICRO_MUTABLE_OP_RESOLVER_H_
#define TENSORFLOW_LITE_MICRO_MICRO_MUTABLE_OP_RESOLVER_H_

#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/core/api/op_resolver.h"
#include "tensorflow/lite/micro/compatibility.h"
#include "tensorflow/lite/schema/schema_generated.h"
#include <cstdio>
#include <cstring>

#ifndef TFLITE_REGISTRATIONS_MAX
#define TFLITE_REGISTRATIONS_MAX (128)
#endif
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/core/api/error_reporter.h"
#include "tensorflow/lite/core/api/flatbuffer_conversions.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/op_macros.h"
#include "tensorflow/lite/micro/compatibility.h"
#include "tensorflow/lite/micro/kernels/micro_ops.h"
#include "tensorflow/lite/micro/micro_op_resolver.h"
#include "tensorflow/lite/schema/schema_generated.h"

namespace tflite {

// Op versions discussed in this file are enumerated here:
// tensorflow/lite/tools/versioning/op_version.cc

inline int MicroOpResolverAnyVersion() { return 0; }

template <unsigned int tOpCount = TFLITE_REGISTRATIONS_MAX>
class MicroOpResolver : public OpResolver {
template <unsigned int tOpCount>
class MicroMutableOpResolver : public MicroOpResolver {
 public:
  const TfLiteRegistration* FindOp(tflite::BuiltinOperator op,
                                   int version) const override {
  explicit MicroMutableOpResolver(ErrorReporter* error_reporter = nullptr)
      : error_reporter_(error_reporter) {}

  const TfLiteRegistration* FindOp(tflite::BuiltinOperator op) const override {
    if (op == BuiltinOperator_CUSTOM) return nullptr;

    for (unsigned int i = 0; i < registrations_len_; ++i) {
      const TfLiteRegistration& registration = registrations_[i];
      if ((registration.builtin_code == op) &&
          (registration.version == MicroOpResolverAnyVersion() ||
           version == MicroOpResolverAnyVersion() ||
           registration.version == version)) {
      if (registration.builtin_code == op) {
        return &registration;
      }
    }
    return nullptr;
  }

  const TfLiteRegistration* FindOp(const char* op, int version) const override {
  const TfLiteRegistration* FindOp(const char* op) const override {
    for (unsigned int i = 0; i < registrations_len_; ++i) {
      const TfLiteRegistration& registration = registrations_[i];
      if ((registration.builtin_code == BuiltinOperator_CUSTOM) &&
          (strcmp(registration.custom_name, op) == 0) &&
          (registration.version == MicroOpResolverAnyVersion() ||
           version == MicroOpResolverAnyVersion() ||
           registration.version == version)) {
          (strcmp(registration.custom_name, op) == 0)) {
        return &registration;
      }
    }
    return nullptr;
  }

  void AddBuiltin(tflite::BuiltinOperator op, TfLiteRegistration* registration,
                  int version = 1) {
    if (registrations_len_ >= tOpCount) {
      // TODO(b/147748244) - Add error reporting hooks so we can report this!
      return;
  MicroOpResolver::BuiltinParseFunction GetOpDataParser(
      BuiltinOperator op) const override {
    TFLITE_DCHECK(num_buitin_ops_ <= tOpCount);
    for (unsigned int i = 0; i < num_buitin_ops_; ++i) {
      if (builtin_codes_[i] == op) return builtin_parsers_[i];
    }
    TfLiteRegistration* new_registration = &registrations_[registrations_len_];
    registrations_len_ += 1;

    *new_registration = *registration;
    new_registration->builtin_code = op;
    new_registration->version = version;
    return nullptr;
  }

  void AddBuiltin(tflite::BuiltinOperator op, TfLiteRegistration* registration,
                  int min_version, int max_version) {
    for (int version = min_version; version <= max_version; ++version) {
      AddBuiltin(op, registration, version);
    }
  }

  void AddCustom(const char* name, TfLiteRegistration* registration,
                 int version = 1) {
  // Registers a Custom Operator with the MicroOpResolver.
  //
  // Only the first call for a given name will be successful. i.e. if this
  // function is called again for a previously added Custom Operator, the
  // MicroOpResolver will be unchanged and this function will return
  // kTfLiteError.
  TfLiteStatus AddCustom(const char* name, TfLiteRegistration* registration) {
    if (registrations_len_ >= tOpCount) {
      // TODO(b/147748244) - Add error reporting hooks so we can report this!
      return;
      if (error_reporter_) {
        TF_LITE_REPORT_ERROR(
            error_reporter_,
            "Couldn't register custom op '%s', resolver size is too small (%d)",
            name, tOpCount);
      }
      return kTfLiteError;
    }

    if (FindOp(name) != nullptr) {
      if (error_reporter_ != nullptr) {
        TF_LITE_REPORT_ERROR(error_reporter_,
                             "Calling AddCustom for the same op more than once "
                             "is not supported (Op: %s).",
                             name);
      }
      return kTfLiteError;
    }

    TfLiteRegistration* new_registration = &registrations_[registrations_len_];
    registrations_len_ += 1;

    *new_registration = *registration;
    new_registration->builtin_code = BuiltinOperator_CUSTOM;
    new_registration->custom_name = name;
    new_registration->version = version;
    return kTfLiteOk;
  }

  void AddCustom(const char* name, TfLiteRegistration* registration,
                 int min_version, int max_version) {
    for (int version = min_version; version <= max_version; ++version) {
      AddCustom(name, registration, version);
    }
  // The Add* functions below add the various Builtin operators to the
  // MicroMutableOpResolver object.

  TfLiteStatus AddAbs() {
    return AddBuiltin(BuiltinOperator_ABS, tflite::ops::micro::Register_ABS(),
                      ParseAbs);
  }

  TfLiteStatus AddAdd() {
    return AddBuiltin(BuiltinOperator_ADD, tflite::ops::micro::Register_ADD(),
                      ParseAdd);
  }

  TfLiteStatus AddArgMax() {
    return AddBuiltin(BuiltinOperator_ARG_MAX,
                      tflite::ops::micro::Register_ARG_MAX(), ParseArgMax);
  }

  TfLiteStatus AddArgMin() {
    return AddBuiltin(BuiltinOperator_ARG_MIN,
                      tflite::ops::micro::Register_ARG_MIN(), ParseArgMin);
  }

  TfLiteStatus AddAveragePool2D() {
    return AddBuiltin(BuiltinOperator_AVERAGE_POOL_2D,
                      tflite::ops::micro::Register_AVERAGE_POOL_2D(),
                      ParsePool);
  }

  TfLiteStatus AddCeil() {
    return AddBuiltin(BuiltinOperator_CEIL, tflite::ops::micro::Register_CEIL(),
                      ParseCeil);
  }

  TfLiteStatus AddCircularBuffer() {
    return AddCustom("CIRCULAR_BUFFER",
                     tflite::ops::micro::Register_CIRCULAR_BUFFER());
  }

  TfLiteStatus AddConcatenation() {
    return AddBuiltin(BuiltinOperator_CONCATENATION,
                      tflite::ops::micro::Register_CONCATENATION(),
                      ParseConcatenation);
  }

  TfLiteStatus AddConv2D() {
    return AddBuiltin(BuiltinOperator_CONV_2D,
                      tflite::ops::micro::Register_CONV_2D(), ParseConv2D);
  }

  TfLiteStatus AddCos() {
    return AddBuiltin(BuiltinOperator_COS, tflite::ops::micro::Register_COS(),
                      ParseCos);
  }

  TfLiteStatus AddDepthwiseConv2D() {
    return AddBuiltin(BuiltinOperator_DEPTHWISE_CONV_2D,
                      tflite::ops::micro::Register_DEPTHWISE_CONV_2D(),
                      ParseDepthwiseConv2D);
  }

  TfLiteStatus AddDequantize() {
    return AddBuiltin(BuiltinOperator_DEQUANTIZE,
                      tflite::ops::micro::Register_DEQUANTIZE(),
                      ParseDequantize);
  }

  TfLiteStatus AddEqual() {
    return AddBuiltin(BuiltinOperator_EQUAL,
                      tflite::ops::micro::Register_EQUAL(), ParseEqual);
  }

  TfLiteStatus AddFloor() {
    return AddBuiltin(BuiltinOperator_FLOOR,
                      tflite::ops::micro::Register_FLOOR(), ParseFloor);
  }

  TfLiteStatus AddFullyConnected() {
    return AddBuiltin(BuiltinOperator_FULLY_CONNECTED,
                      tflite::ops::micro::Register_FULLY_CONNECTED(),
                      ParseFullyConnected);
  }

  TfLiteStatus AddGreater() {
    return AddBuiltin(BuiltinOperator_GREATER,
                      tflite::ops::micro::Register_GREATER(), ParseGreater);
  }

  TfLiteStatus AddGreaterEqual() {
    return AddBuiltin(BuiltinOperator_GREATER_EQUAL,
                      tflite::ops::micro::Register_GREATER_EQUAL(),
                      ParseGreaterEqual);
  }

  TfLiteStatus AddHardSwish() {
    return AddBuiltin(BuiltinOperator_HARD_SWISH,
                      tflite::ops::micro::Register_HARD_SWISH(),
                      ParseHardSwish);
  }

  TfLiteStatus AddL2Normalization() {
    return AddBuiltin(BuiltinOperator_L2_NORMALIZATION,
                      tflite::ops::micro::Register_L2_NORMALIZATION(),
                      ParseL2Normalization);
  }

  TfLiteStatus AddLess() {
    return AddBuiltin(BuiltinOperator_LESS, tflite::ops::micro::Register_LESS(),
                      ParseLess);
  }

  TfLiteStatus AddLessEqual() {
    return AddBuiltin(BuiltinOperator_LESS_EQUAL,
                      tflite::ops::micro::Register_LESS_EQUAL(),
                      ParseLessEqual);
  }

  TfLiteStatus AddLog() {
    return AddBuiltin(BuiltinOperator_LOG, tflite::ops::micro::Register_LOG(),
                      ParseLog);
  }

  TfLiteStatus AddLogicalAnd() {
    return AddBuiltin(BuiltinOperator_LOGICAL_AND,
                      tflite::ops::micro::Register_LOGICAL_AND(),
                      ParseLogicalAnd);
  }

  TfLiteStatus AddLogicalNot() {
    return AddBuiltin(BuiltinOperator_LOGICAL_NOT,
                      tflite::ops::micro::Register_LOGICAL_NOT(),
                      ParseLogicalNot);
  }

  TfLiteStatus AddLogicalOr() {
    return AddBuiltin(BuiltinOperator_LOGICAL_OR,
                      tflite::ops::micro::Register_LOGICAL_OR(),
                      ParseLogicalOr);
  }

  TfLiteStatus AddLogistic() {
    return AddBuiltin(BuiltinOperator_LOGISTIC,
                      tflite::ops::micro::Register_LOGISTIC(), ParseLogistic);
  }

  TfLiteStatus AddMaximum() {
    return AddBuiltin(BuiltinOperator_MAXIMUM,
                      tflite::ops::micro::Register_MAXIMUM(), ParseMaximum);
  }

  TfLiteStatus AddMaxPool2D() {
    return AddBuiltin(BuiltinOperator_MAX_POOL_2D,
                      tflite::ops::micro::Register_MAX_POOL_2D(), ParsePool);
  }

  TfLiteStatus AddMean() {
    return AddBuiltin(BuiltinOperator_MEAN, tflite::ops::micro::Register_MEAN(),
                      ParseReducer);
  }

  TfLiteStatus AddMinimum() {
    return AddBuiltin(BuiltinOperator_MINIMUM,
                      tflite::ops::micro::Register_MINIMUM(), ParseMinimum);
  }

  TfLiteStatus AddMul() {
    return AddBuiltin(BuiltinOperator_MUL, tflite::ops::micro::Register_MUL(),
                      ParseMul);
  }

  TfLiteStatus AddNeg() {
    return AddBuiltin(BuiltinOperator_NEG, tflite::ops::micro::Register_NEG(),
                      ParseNeg);
  }

  TfLiteStatus AddNotEqual() {
    return AddBuiltin(BuiltinOperator_NOT_EQUAL,
                      tflite::ops::micro::Register_NOT_EQUAL(), ParseNotEqual);
  }

  TfLiteStatus AddPack() {
    return AddBuiltin(BuiltinOperator_PACK, tflite::ops::micro::Register_PACK(),
                      ParsePack);
  }

  TfLiteStatus AddPad() {
    return AddBuiltin(BuiltinOperator_PAD, tflite::ops::micro::Register_PAD(),
                      ParsePad);
  }

  TfLiteStatus AddPadV2() {
    return AddBuiltin(BuiltinOperator_PADV2,
                      tflite::ops::micro::Register_PADV2(), ParsePadV2);
  }

  TfLiteStatus AddPrelu() {
    return AddBuiltin(BuiltinOperator_PRELU,
                      tflite::ops::micro::Register_PRELU(), ParsePrelu);
  }

  TfLiteStatus AddQuantize() {
    return AddBuiltin(BuiltinOperator_QUANTIZE,
                      tflite::ops::micro::Register_QUANTIZE(), ParseQuantize);
  }

  TfLiteStatus AddRelu() {
    return AddBuiltin(BuiltinOperator_RELU, tflite::ops::micro::Register_RELU(),
                      ParseRelu);
  }

  TfLiteStatus AddRelu6() {
    return AddBuiltin(BuiltinOperator_RELU6,
                      tflite::ops::micro::Register_RELU6(), ParseRelu6);
  }

  TfLiteStatus AddReshape() {
    return AddBuiltin(BuiltinOperator_RESHAPE,
                      tflite::ops::micro::Register_RESHAPE(), ParseReshape);
  }

  TfLiteStatus AddResizeNearestNeighbor() {
    return AddBuiltin(BuiltinOperator_RESIZE_NEAREST_NEIGHBOR,
                      tflite::ops::micro::Register_RESIZE_NEAREST_NEIGHBOR(),
                      ParseResizeNearestNeighbor);
  }

  TfLiteStatus AddRound() {
    return AddBuiltin(BuiltinOperator_ROUND,
                      tflite::ops::micro::Register_ROUND(), ParseRound);
  }

  TfLiteStatus AddRsqrt() {
    return AddBuiltin(BuiltinOperator_RSQRT,
                      tflite::ops::micro::Register_RSQRT(), ParseRsqrt);
  }

  TfLiteStatus AddSin() {
    return AddBuiltin(BuiltinOperator_SIN, tflite::ops::micro::Register_SIN(),
                      ParseSin);
  }

  TfLiteStatus AddSoftmax() {
    return AddBuiltin(BuiltinOperator_SOFTMAX,
                      tflite::ops::micro::Register_SOFTMAX(), ParseSoftmax);
  }

  TfLiteStatus AddSplit() {
    return AddBuiltin(BuiltinOperator_SPLIT,
                      tflite::ops::micro::Register_SPLIT(), ParseSplit);
  }

  TfLiteStatus AddSqrt() {
    return AddBuiltin(BuiltinOperator_SQRT, tflite::ops::micro::Register_SQRT(),
                      ParseSqrt);
  }

  TfLiteStatus AddSquare() {
    return AddBuiltin(BuiltinOperator_SQUARE,
                      tflite::ops::micro::Register_SQUARE(), ParseSquare);
  }

  TfLiteStatus AddStridedSlice() {
    return AddBuiltin(BuiltinOperator_STRIDED_SLICE,
                      tflite::ops::micro::Register_STRIDED_SLICE(),
                      ParseStridedSlice);
  }

  TfLiteStatus AddSub() {
    return AddBuiltin(BuiltinOperator_SUB, tflite::ops::micro::Register_SUB(),
                      ParseSub);
  }

  TfLiteStatus AddSvdf() {
    return AddBuiltin(BuiltinOperator_SVDF, tflite::ops::micro::Register_SVDF(),
                      ParseSvdf);
  }

  TfLiteStatus AddTanh() {
    return AddBuiltin(BuiltinOperator_TANH, tflite::ops::micro::Register_TANH(),
                      ParseTanh);
  }

  TfLiteStatus AddUnpack() {
    return AddBuiltin(BuiltinOperator_UNPACK,
                      tflite::ops::micro::Register_UNPACK(), ParseUnpack);
  }

  unsigned int GetRegistrationLength() { return registrations_len_; }

 private:
  TfLiteStatus AddBuiltin(tflite::BuiltinOperator op,
                          const TfLiteRegistration& registration,
                          MicroOpResolver::BuiltinParseFunction parser) {
    if (op == BuiltinOperator_CUSTOM) {
      if (error_reporter_ != nullptr) {
        TF_LITE_REPORT_ERROR(error_reporter_,
                             "Invalid parameter BuiltinOperator_CUSTOM to the "
                             "AddBuiltin function.");
      }
      return kTfLiteError;
    }

    if (FindOp(op) != nullptr) {
      if (error_reporter_ != nullptr) {
        TF_LITE_REPORT_ERROR(error_reporter_,
                             "Calling AddBuiltin with the same op more than "
                             "once is not supported (Op: #%d).",
                             op);
      }
      return kTfLiteError;
    }

    if (registrations_len_ >= tOpCount) {
      if (error_reporter_) {
        TF_LITE_REPORT_ERROR(error_reporter_,
                             "Couldn't register builtin op #%d, resolver size "
                             "is too small (%d).",
                             op, tOpCount);
      }
      return kTfLiteError;
    }

    registrations_[registrations_len_] = registration;
    // Strictly speaking, the builtin_code is not necessary for TFLM but filling
    // it in regardless.
    registrations_[registrations_len_].builtin_code = op;
    registrations_len_++;

    builtin_codes_[num_buitin_ops_] = op;
    builtin_parsers_[num_buitin_ops_] = parser;
    num_buitin_ops_++;

    return kTfLiteOk;
  }

  TfLiteRegistration registrations_[tOpCount];
  unsigned int registrations_len_ = 0;
 public:
  TF_LITE_REMOVE_VIRTUAL_DELETE
};

// TODO(b/147854028): Consider switching all uses of MicroMutableOpResolver to
// MicroOpResolver.
class MicroMutableOpResolver
    : public MicroOpResolver<TFLITE_REGISTRATIONS_MAX> {
 private:
  // Arrays (and counter) to store the builtin codes and their corresponding
  // parse functions as these are registered with the Op Resolver.
  BuiltinOperator builtin_codes_[tOpCount];
  MicroOpResolver::BuiltinParseFunction builtin_parsers_[tOpCount];
  unsigned int num_buitin_ops_ = 0;

  ErrorReporter* error_reporter_;
 public:
  TF_LITE_REMOVE_VIRTUAL_DELETE
};

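Again not part of the diff: a sketch of how a caller sizes and fills the templated resolver. The op count of 5 and the particular kernels are assumptions for a hypothetical conv/pool/dense/softmax model, not values from this commit.

// Sketch only: the template argument must cover every distinct op added.
#include "tensorflow/lite/micro/micro_error_reporter.h"
#include "tensorflow/lite/micro/micro_mutable_op_resolver.h"

static tflite::MicroErrorReporter error_reporter;
static tflite::MicroMutableOpResolver<5> resolver(&error_reporter);

// Each Add* call returns kTfLiteError if the resolver is already full or if
// the same op is registered twice, so the results are worth checking.
static bool RegisterOps() {
  return resolver.AddConv2D() == kTfLiteOk &&
         resolver.AddMaxPool2D() == kTfLiteOk &&
         resolver.AddFullyConnected() == kTfLiteOk &&
         resolver.AddReshape() == kTfLiteOk &&
         resolver.AddSoftmax() == kTfLiteOk;
}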
@ -20,8 +20,18 @@ limitations under the License.
#endif

#include <cinttypes>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

#include "flatbuffers/flatbuffers.h"  // from @flatbuffers
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/micro/memory_helpers.h"
#include "tensorflow/lite/micro/micro_allocator.h"
#include "tensorflow/lite/micro/micro_interpreter.h"
#include "tensorflow/lite/schema/schema_generated.h"

namespace tflite {
namespace {

@ -75,6 +85,8 @@ const char* TensorTypeName(TfLiteType type) {
      return "kTfLiteInt16";
    case kTfLiteComplex64:
      return "kTfLiteComplex64";
    case kTfLiteComplex128:
      return "kTfLiteComplex128";
    case kTfLiteFloat16:
      return "kTfLiteFloat16";
    case kTfLiteFloat64:
@ -95,11 +107,44 @@ const char* AllocTypeName(TfLiteAllocationType type) {
      return "kTfLiteArenaRw";
    case kTfLiteArenaRwPersistent:
      return "kTfLiteArenaRwPersistent";
    case kTfLitePersistentRo:
      return "kTfLitePersistentRo";
    case kTfLiteCustom:
      return "kTfLiteCustom";
  }
  return "(invalid)";
}
}  // namespace

// Helper function to print model flatbuffer data. This function is not called
// by default. Hence it's not linked in to the final binary code.
void PrintModelData(const Model* model, ErrorReporter* error_reporter) {
#ifndef TF_LITE_STRIP_ERROR_STRINGS
  auto* subgraphs = model->subgraphs();
  const SubGraph* subgraph = (*subgraphs)[0];
  const flatbuffers::Vector<flatbuffers::Offset<Tensor>>* tensors =
      subgraph->tensors();
  const flatbuffers::Vector<flatbuffers::Offset<Buffer>>* buffers =
      model->buffers();
  TF_LITE_REPORT_ERROR(error_reporter, "==== Model info: =====");
  for (size_t i = 0; i < tensors->size(); ++i) {
    const tflite::Tensor& flatbuffer_tensor = *tensors->Get(i);
    size_t type_size, tensor_size;
    auto* buffer = (*buffers)[flatbuffer_tensor.buffer()];
    auto* array = buffer->data();
    int array_size = 0;
    if (array) {
      array_size = array->size();
    }
    BytesRequiredForTensor(flatbuffer_tensor, &tensor_size, &type_size,
                           error_reporter);
    TF_LITE_REPORT_ERROR(
        error_reporter, "Tensor index: %d arena tensor %d size %d ", i,
        !array_size && !flatbuffer_tensor.is_variable(), tensor_size);
  }
#endif
}

// Prints a dump of what tensors and what nodes are in the interpreter.
void PrintInterpreterState(MicroInterpreter* interpreter) {
  printf("Interpreter has %zu tensors and %zu nodes\n",
@ -113,10 +158,9 @@ void PrintInterpreterState(MicroInterpreter* interpreter) {
  for (size_t tensor_index = 0; tensor_index < interpreter->tensors_size();
       tensor_index++) {
    TfLiteTensor* tensor = interpreter->tensor(static_cast<int>(tensor_index));
    printf("Tensor %3zu %-20s %10s %15s %10zu bytes (%4.1f MB) ", tensor_index,
           tensor->name, TensorTypeName(tensor->type),
           AllocTypeName(tensor->allocation_type), tensor->bytes,
           static_cast<double>(tensor->bytes / (1 << 20)));
    printf("Tensor %3zu %10s %15s %10zu bytes (%4.1f MB) ", tensor_index,
           TensorTypeName(tensor->type), AllocTypeName(tensor->allocation_type),
           tensor->bytes, static_cast<double>(tensor->bytes / (1 << 20)));
    PrintTfLiteIntVector(tensor->dims);
  }
  printf("\n");

@ -20,6 +20,9 @@ limitations under the License.
#include "tensorflow/lite/micro/micro_interpreter.h"

namespace tflite {
// Helper function to print model flatbuffer data. This function is not called
// by default. Hence it's not linked in to the final binary code.
void PrintModelData(const Model* model, ErrorReporter* error_reporter);
// Prints a dump of what tensors and what nodes are in the interpreter.
void PrintInterpreterState(MicroInterpreter* interpreter);
}  // namespace tflite

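A hedged usage note, not from the diff: the two helpers declared above are handy when sizing the tensor arena. Reusing the hypothetical names from the earlier sketch:

// Assumes model, error_reporter and a constructed interpreter already exist.
if (interpreter.AllocateTensors() == kTfLiteOk) {
  tflite::PrintModelData(model, &error_reporter);  // per-tensor sizes from the flatbuffer
  tflite::PrintInterpreterState(&interpreter);     // allocated tensors, dims and nodes
}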
@ -23,6 +23,7 @@ limitations under the License.

#include <cstdarg>
#include <cstdint>
#include <cstring>

namespace {

@ -125,7 +126,8 @@ char* FastFloatToBufferLeft(float f, char* buffer) {
  const int32_t exponent_shift = 23;
  const int32_t exponent_bias = 127;
  const uint32_t fraction_mask = 0x007fffff;
  const uint32_t u = *reinterpret_cast<uint32_t*>(&f);
  uint32_t u;
  memcpy(&u, &f, sizeof(int32_t));
  const int32_t exponent =
      ((u & exponent_mask) >> exponent_shift) - exponent_bias;
  const uint32_t fraction = (u & fraction_mask);
@ -163,7 +165,47 @@ char* FastFloatToBufferLeft(float f, char* buffer) {
  *current = '.';
  current += 1;
  *current = 0;

  // Prepend leading zeros to fill in all 7 bytes of the fraction. Truncate
  // zeros off the end of the fraction. Every fractional value takes 7 bytes.
  // For example, 2500 would be written into the buffer as 0002500 since it
  // represents .00025.
  constexpr int kMaxFractionalDigits = 7;

  // Abort early if there is not enough space in the buffer.
  if (current_end - current <= kMaxFractionalDigits) {
    return current;
  }

  // Pre-fill buffer with zeros to ensure zero-truncation works properly.
  for (int i = 1; i < kMaxFractionalDigits; i++) {
    *(current + i) = '0';
  }

  // Track how large the fraction is to add leading zeros.
  char* previous = current;
  current = StrCatUInt32(current, (current_end - current), scaled_fraction, 10);
  int fraction_digits = current - previous;
  int leading_zeros = kMaxFractionalDigits - fraction_digits;

  // Overwrite the null terminator from StrCatUInt32 to ensure zero-truncation
  // works properly.
  *current = '0';

  // Shift fraction values and prepend zeros.
  for (int i = 0; i < fraction_digits; i++) {
    current--;
    *(current + leading_zeros) = *current;
    *current = '0';
  }
  current += kMaxFractionalDigits;

  // Truncate trailing zeros for cleaner logs. Ensure we leave at least one
  // fractional character for the case when scaled_fraction is 0.
  while (*(current - 1) == '0' && (current - 1) > previous) {
    current--;
  }
  *current = 0;
  current = StrCatStr(current, (current_end - current), "*2^");
  current = StrCatInt32(current, (current_end - current), exponent);
  return current;

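One aside that is not part of the patch: the reinterpret_cast-to-memcpy change above is the standard strict-aliasing-safe way to read a float's bit pattern. A small self-contained illustration of the same field extraction:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Prints the sign, exponent and fraction fields of an IEEE-754 float, copying
// the bits with memcpy exactly as the patched FastFloatToBufferLeft now does.
static void PrintFloatBits(float f) {
  uint32_t u;
  memcpy(&u, &f, sizeof(u));
  const unsigned sign = u >> 31;
  const int exponent = static_cast<int>((u >> 23) & 0xff) - 127;
  const unsigned fraction = u & 0x007fffff;
  printf("%f -> sign=%u exponent=%d fraction=0x%06x\n",
         static_cast<double>(f), sign, exponent, fraction);
}

int main() {
  PrintFloatBits(1.5f);    // sign=0 exponent=0  fraction=0x400000
  PrintFloatBits(-0.25f);  // sign=1 exponent=-2 fraction=0x000000
  return 0;
}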
Some files were not shown because too many files have changed in this diff.