earthengine-api/python/ee/serializer.py
Google Earth Engine Authors 1baca8818e v0.1.369
PiperOrigin-RevId: 563532158
2023-09-14 00:34:19 +00:00

599 lines
22 KiB
Python

#!/usr/bin/env python3
"""A serializer that encodes EE object trees as JSON DAGs."""
import collections
import datetime
import hashlib
import json
import numbers
from typing import Any, Dict, List, Optional, Set
from ee import _cloud_api_utils
from ee import ee_exception
from ee import encodable
# The datetime for the beginning of the Unix epoch.
_EPOCH_DATETIME = datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
# Don't generate very deep expressions, as the backend rejects them.
# The backend's limit is 100, and we want to stay well away from that
# as a few extra levels of wrapping are always added.
_DEPTH_LIMIT = 50
# pylint: disable-next=g-bad-name
def DatetimeToMicroseconds(date: datetime.datetime) -> int:
"""Convert a datetime to a timestamp, microseconds since the epoch."""
if date.tzinfo is None:
# Assume that the time is in utc.
date = date.replace(tzinfo=datetime.timezone.utc)
td = (date - _EPOCH_DATETIME)
return td.microseconds + (td.seconds + td.days * 24 * 3600) * 1000000
class Serializer:
"""A serializer for EE object trees."""
unbound_name: Optional[str]
# Whether the encoding should factor out shared subtrees.
_is_compound: bool
_for_cloud_api: bool
# A list of shared subtrees as [name, value] pairs.
_scope: List[str]
# A lookup table from object hash to subtree names as stored in self._scope
_encoded: Dict[Any, Any]
# A lookup table from object ID as retrieved by id() to md5 hash values.
_hashcache: Dict[Any, Any]
def __init__(
self,
is_compound: bool = True,
for_cloud_api: bool = False,
unbound_name: Optional[str] = None,
):
"""Constructs a serializer.
Args:
is_compound: Whether the encoding should factor out shared subtrees.
for_cloud_api: Whether the encoding should be done for the Cloud API or
the legacy API.
unbound_name: Provides a name for unbound variables in objects.
"""
self.unbound_name = unbound_name
self._is_compound = is_compound
self._for_cloud_api = for_cloud_api
self._scope = []
self._encoded = {}
self._hashcache = {}
def _encode(self, obj: Any) -> Any:
"""Encodes a top level object to be executed server-side.
Args:
obj: The object to encode.
Returns:
An encoded object ready for JSON serialization.
"""
if self._for_cloud_api:
return self._encode_for_cloud_api(obj)
value = self._encode_value(obj)
if self._is_compound:
if (isinstance(value, dict) and value['type'] == 'ValueRef' and
len(self._scope) == 1):
# Just one value. No need for complex structure.
value = self._scope[0][1]
else:
# Wrap the scopes and final value with a CompoundValue.
value = {'type': 'CompoundValue', 'scope': self._scope, 'value': value}
# Clear state in case of future encoding.
self._scope = []
self._encoded = {}
self._hashcache = {}
return value
def _encode_for_cloud_api(self, obj: Any) -> Any:
"""Encodes an object as an Expression or quasi-Expression."""
value = self._encode_cloud_object(obj)
if self._is_compound:
# Wrap the scopes and final value into an Expression.
value = _ExpressionOptimizer(value, self._scope).optimize()
# Clear state in case of future encoding.
self._scope = []
self._encoded = {}
self._hashcache = {}
else:
value = _ExpressionOptimizer(value).optimize()
return value
def _encode_value(self, obj: Any) -> Any:
"""Encodes a subtree as a Value in the EE API v2 (DAG) format.
If _is_compound is True, this will fill the _scope and _encoded properties.
Args:
obj: The object to encode.
Returns:
An encoded object.
"""
obj_id = id(obj)
hashval = self._hashcache.get(obj_id)
encoded = self._encoded.get(hashval, None)
if self._is_compound and encoded:
# Already encoded objects are encoded as ValueRefs and returned directly.
return {'type': 'ValueRef', 'value': encoded}
elif obj is None or isinstance(obj, (bool, numbers.Number, str)):
# Primitives are encoded as is and not saved in the scope.
return obj
elif isinstance(obj, datetime.datetime):
# A raw date slipped through. Wrap it. Calling ee.Date from here would
# cause a circular dependency, so we encode it manually.
return {
'type': 'Invocation',
'functionName': 'Date',
'arguments': {
'value': DatetimeToMicroseconds(obj) / 1e3
}
}
elif isinstance(obj, encodable.Encodable):
# Some objects know how to encode themselves.
result = obj.encode(self._encode_value)
if (not isinstance(result, (list, tuple)) and
(not isinstance(result, (dict)) or result['type'] == 'ArgumentRef')):
# Optimization: simple enough that adding it to the scope is probably
# not worth it.
return result
elif isinstance(obj, encodable.EncodableFunction):
result = obj.encode_invocation(self._encode_value)
if (not isinstance(result, (list, tuple)) and
(not isinstance(result, (dict)) or result['type'] == 'ArgumentRef')):
# Optimization: simple enough that adding it to the scope is probably
# not worth it.
return result
elif isinstance(obj, (list, tuple)):
# Lists are encoded recursively.
result = [self._encode_value(i) for i in obj]
elif isinstance(obj, dict):
# Dictionary are encoded recursively and wrapped in a type specifier.
result = {
'type':
'Dictionary',
'value':
dict([(key, self._encode_value(value))
for key, value in obj.items()])
}
else:
raise ee_exception.EEException('Cannot encode object: %s' % obj)
if self._is_compound:
# Save the new object and return a ValueRef.
hashval = hashlib.md5(json.dumps(result).encode()).digest()
self._hashcache[obj_id] = hashval
name = self._encoded.get(hashval, None)
if not name:
name = str(len(self._scope))
self._scope.append((name, result))
self._encoded[hashval] = name
return {'type': 'ValueRef', 'value': name}
else:
return result
def _encode_cloud_object(self, obj: Any) -> Any:
"""Encodes an object using the Cloud API Expression form.
If _is_compound is True, this will fill the _scope and _encoded properties.
Args:
obj: The object to encode.
Returns:
If _is_compound is True, a string that is the key under which the
encoded object is stored in _scope.
If _is_compound is False, the encoded object as a single quasi-Expression.
"""
obj_id = id(obj)
hashval = self._hashcache.get(obj_id)
reference = self._encoded.get(hashval, None)
if reference:
return reference
elif obj is None or isinstance(obj, (bool, str)):
result = {'constantValue': obj}
elif isinstance(obj, numbers.Number):
result = _cloud_api_utils.encode_number_as_cloud_value(obj)
elif isinstance(obj, datetime.datetime):
# A raw date slipped through. Wrap it. Calling ee.Date from here would
# cause a circular dependency, so we encode it manually.
result = {
'functionInvocationValue': {
'functionName': 'Date',
'arguments': {
'value': {
'constantValue': DatetimeToMicroseconds(obj) / 1e3
}
}
}
}
elif isinstance(obj, encodable.Encodable):
# Some objects know how to encode themselves.
result = obj.encode_cloud_value(self._encode_cloud_object)
elif isinstance(obj, (list, tuple)):
# Lists are encoded recursively.
if self._is_compound:
result = {
'arrayValue': {
'values': [{
'valueReference': self._encode_cloud_object(i)
} for i in obj]
}
}
else:
result = {
'arrayValue': {
'values': [self._encode_cloud_object(i) for i in obj]
}
}
elif isinstance(obj, dict):
# Dictionary are encoded recursively and wrapped in a type specifier.
# We iterate through the entries in a deterministic order, not because it
# affects the order of the entries in the output result, but because it
# affects the names that they are assigned in _scope; without the
# ordering, the encoding process may produce one of multiple different
# (albeit equivalent) representations.
if self._is_compound:
result = {
'dictionaryValue': {
'values': {
key: {
'valueReference': self._encode_cloud_object(obj[key])
} for key in sorted(obj)
}
}
}
else:
result = {
'dictionaryValue': {
'values': {
key: self._encode_cloud_object(obj[key])
for key in sorted(obj)
}
}
}
else:
raise ee_exception.EEException('Cannot encode object: %s' % obj)
if self._is_compound:
# Save the new object and return a ValueRef.
hashval = hashlib.md5(json.dumps(result).encode()).digest()
self._hashcache[obj_id] = hashval
name = self._encoded.get(hashval, None)
if not name:
name = str(len(self._scope))
self._scope.append((name, result))
self._encoded[hashval] = name
return name
else:
return result
def encode(
obj: Any,
is_compound: bool = True,
for_cloud_api: bool = True,
unbound_name: Optional[str] = None,
) -> Any:
"""Serialize an object to a JSON-compatible structure for API calls.
Args:
obj: The object to serialize.
is_compound: Whether the encoding should factor out shared subtrees.
for_cloud_api: Whether the encoding should be done for the Cloud API or the
legacy API.
unbound_name: Provides a name for unbound variables in objects. Unbound
variables are otherwise disallowed. See the Count Functions usage in
customfunction.py.
Returns:
A JSON-compatible structure representing the input.
"""
serializer = Serializer(
is_compound, for_cloud_api=for_cloud_api, unbound_name=unbound_name)
return serializer._encode(obj) # pylint: disable=protected-access
# pylint: disable-next=g-bad-name
def toJSON(obj, opt_pretty: bool = False, for_cloud_api: bool = True) -> Any:
"""Serialize an object to a JSON string appropriate for API calls.
Args:
obj: The object to serialize.
opt_pretty: True to pretty-print the object.
for_cloud_api: Whether the encoding should be done for the Cloud API or the
legacy API.
Returns:
A JSON string representing the input.
"""
serializer = Serializer(not opt_pretty, for_cloud_api=for_cloud_api)
encoded = serializer._encode(obj) # pylint: disable=protected-access
return json.dumps(encoded, indent=2 if opt_pretty else None)
# pylint: disable-next=g-bad-name
def toReadableJSON(obj: Any, for_cloud_api: bool = True) -> Any:
"""Convert an object to readable JSON."""
return toJSON(obj, True, for_cloud_api=for_cloud_api)
class _ExpressionOptimizer:
"""Optimises the representation of an Expression.
The Expressions generated by recursive encoding can be inefficiently
represented. This class helps improve the representation.
The initial representation is intentionally simple, as it makes the encoding
logic simple. Constants end up as individual ValueNodes, though the Expression
format itself allows complex constants (nested arrays and/or dicts containing
constant values). There are also often places where references to ValueNodes
can be replaced by direct inclusion of those ValueNodes.
This operates in two modes:
- It can be passed an Expression as a dict of named ValueNodes, and the name
that represents the final result. In this case, it returns the optimised
Expression in the same form. This is the "compound" mode.
- It can be passed a quasi-Expression as a single object. In this case, it
returns the optimised quasi-Expression in the same form. This is the
"non-compound" mode. A "quasi-Expression" is essentially an Expression DAG
that's been expanded to a tree by replacing references with the actual thing
being referenced. This means that if the same entity is referenced more than
once, it will be duplicated in the tree.
The rules that the optimiser follows are straightforward:
- If a value is referred to only once, lift it into the place that references
it.
- If a value is a numeric or boolean constant, lift it into all the places
that reference it.
- If a value is a string constant, lift it if it is referred to only once.
- Collapse dicts and arrays of constants to constant dicts/arrays.
"""
def __init__(self, result: Any, values: Optional[Any] = None):
"""Builds an ExpressionOptimizer.
Args:
result: The result to optimize, either as a key of "values", or as a
quasi-Expression.
values: If provided (in compound mode), a set of named ValueNodes.
"""
self._result = result
# Convert sequence of tuples to a dict allowing lookup.
# Values will be of the form:
# [('0', {'constantValue': 99}), ('1', {'constantValue': 98}), ...]
self._values = dict(values) if values is not None else None
if self._is_compound():
self._single_uses = self._find_single_uses()
self._optimized_values = {}
self._reference_map = {}
def _is_compound(self) -> bool:
return self._values is not None
def _find_single_uses(self) -> Set[Any]:
"""Finds the names of all named values that are referred to only once."""
reference_counts = collections.defaultdict(int)
reference_counts[self._result] += 1
def _contained_reference(value: Any) -> Optional[Any]:
"""Gets a contained reference from a ValueNode, if there is one."""
if 'functionDefinitionValue' in value:
return value['functionDefinitionValue']['body']
elif 'functionInvocationValue' in value:
function_invocation = value['functionInvocationValue']
if 'functionReference' in function_invocation:
return function_invocation['functionReference']
elif 'valueReference' in value:
return value['valueReference']
return None
def increment_reference_count(value: Any) -> None:
reference = _contained_reference(value)
if reference is not None:
reference_counts[reference] += 1
self._visit_all_values_in_expression(increment_reference_count)
return set(reference for reference, count in reference_counts.items()
if count == 1)
def optimize(self) -> Any:
"""Optimises the expression, returning the optimised form."""
optimized_result = self._optimize_referred_value(self._result)
if self._is_compound():
return {'result': optimized_result, 'values': self._optimized_values}
else:
return optimized_result
def _optimize_referred_value(self, reference_or_value: Any) -> Any:
"""Recursively optimises a value.
Optimises a value and everything recursively reachable from it.
This operates differently depending on the mode.
In compound mode:
Takes a name (in _values) for a ValueNode, optimises the referenced
ValueNode, and returns a name (in _optimized_values) for the optimised
ValueNode. Updates _optimized_values and _reference_map.
In non-compound mode:
Takes a quasi-ValueNode, optimises it, and returns the optimised
quasi-ValueNode.
Args:
reference_or_value: The name in _values of the value to optimise, or the
actual value itself.
Returns:
The name, in _optimized_values, of the optimised value, or the optimised
value itself.
"""
if self._is_compound():
if reference_or_value in self._reference_map:
return self._reference_map[reference_or_value]
mapped_reference = str(len(self._reference_map))
self._reference_map[reference_or_value] = mapped_reference
self._optimized_values[mapped_reference] = self._optimize_value(
self._values[reference_or_value], 0)
return mapped_reference
else:
return self._optimize_value(reference_or_value, 0)
def _optimize_value(self, value: Any, depth: int) -> Any:
"""Optimises a single value.
Args:
value: The ValueNode to optimise, in dict form.
depth: How deep in the encoded output this value will be placed.
Returns:
An optimised version of that value, created by lifting in all feasible
constants and references, subject (in compound mode) to a depth limit.
"""
if any(
x in value for x in
['constantValue', 'integerValue', 'bytesValue', 'argumentReference']):
# Not optimisable.
return value
elif 'arrayValue' in value:
# Optimise recursively, then turn an array of constants into a constant
# array.
optimized_array = [
self._optimize_value(array_value, depth + 3)
for array_value in value['arrayValue']['values']
]
if all(self._is_constant_value(v) for v in optimized_array):
optimized_array = [v['constantValue'] for v in optimized_array]
return {'constantValue': optimized_array}
else:
return {'arrayValue': {'values': optimized_array}}
elif 'dictionaryValue' in value:
# Optimise recursively, then turn a dict of constants into a constant
# dict.
optimized_dict = {
key: self._optimize_value(dict_value, depth + 3)
for key, dict_value in value['dictionaryValue']['values'].items()
}
if all(self._is_constant_value(v) for v in optimized_dict.values()):
optimized_dict = {
k: v['constantValue'] for k, v in optimized_dict.items()
}
return {'constantValue': optimized_dict}
else:
return {'dictionaryValue': {'values': optimized_dict}}
elif 'functionDefinitionValue' in value:
function_definition = value['functionDefinitionValue']
return {
'functionDefinitionValue': {
'argumentNames': function_definition['argumentNames'],
'body': self._optimize_referred_value(function_definition['body'])
}
}
elif 'functionInvocationValue' in value:
function_invocation = value['functionInvocationValue']
arguments = function_invocation['arguments']
optimized_invocation = {}
if 'functionName' in function_invocation:
optimized_invocation['functionName'] = function_invocation[
'functionName']
else:
optimized_invocation[
'functionReference'] = self._optimize_referred_value(
function_invocation['functionReference'])
optimized_invocation['arguments'] = {
k: self._optimize_value(v, depth + 3) for k, v in arguments.items()
}
return {'functionInvocationValue': optimized_invocation}
elif 'valueReference' in value:
# Lift if possible: anything used only here, anything lightweight.
reference = value['valueReference']
if not self._is_compound():
return self._optimize_value(reference, depth)
referenced_value = self._values[reference]
if reference in self._single_uses and depth < _DEPTH_LIMIT:
return self._optimize_value(referenced_value, depth)
else:
if self._is_always_liftable(referenced_value):
return referenced_value
return {'valueReference': self._optimize_referred_value(reference)}
def _is_always_liftable(self, value: Any) -> bool:
"""Determines if a value is simple enough to lift unconditionally."""
# Non-string constants and argument references are simple enough.
if 'constantValue' in value:
return self._is_liftable_constant(value['constantValue'])
else:
return 'argumentReference' in value
def _is_liftable_constant(self, value: Any) -> bool:
"""Whether a constant is simple enough to lift to where it's referenced."""
return value is None or isinstance(value, (bool, numbers.Number))
def _is_constant_value(self, value: Any) -> bool:
"""Whether a ValueNode (as a dict) is a constant."""
return 'constantValue' in value
def _visit_all_values_in_expression(self, visitor: Any) -> None:
"""Calls visitor on all ValueNodes in the expression.
Args:
visitor: A callable that will be invoked once at every ValueNode in the
expression, including nested ValueNodes.
"""
self._visit_all_values(self._result, self._values[self._result], set(),
visitor)
def _visit_all_values(
self, reference: Any, value: Any, visited: Any, visitor: Any
) -> None:
"""Calls visitor on a ValueNode and its descendants.
Args:
reference: A reference to the ValueNode, or None.
value: The ValueNode, in dict form.
visited: A set of references for which the visitor has already been
invoked.
visitor: The callable to invoke.
"""
if reference is not None:
if reference in visited:
return
visited.add(reference)
visitor(value)
if 'arrayValue' in value:
for v in value['arrayValue']['values']:
self._visit_all_values(None, v, visited, visitor)
elif 'dictionaryValue' in value:
d = value['dictionaryValue']['values']
for k in sorted(d):
self._visit_all_values(None, d[k], visited, visitor)
elif 'functionDefinitionValue' in value:
definition_reference = value['functionDefinitionValue']['body']
self._visit_all_values(definition_reference,
self._values[definition_reference], visited,
visitor)
elif 'functionInvocationValue' in value:
function_invocation = value['functionInvocationValue']
if 'functionReference' in function_invocation:
function_reference = function_invocation['functionReference']
self._visit_all_values(function_reference,
self._values[function_reference], visited,
visitor)
arguments = function_invocation['arguments']
for k in sorted(arguments):
self._visit_all_values(None, arguments[k], visited, visitor)
elif 'valueReference' in value:
value_reference = value['valueReference']
self._visit_all_values(value_reference, self._values[value_reference],
visited, visitor)