earthengine-api/python/ee/clusterer.py
Kurt Schwehr ff7a5fbdbc ee pyupgrade --py39-plus
PiperOrigin-RevId: 780619373
2025-07-08 10:27:19 -07:00

336 lines
12 KiB
Python

"""A wrapper for Clusterers."""
from __future__ import annotations
from ee import _arg_types
from ee import apifunction
from ee import computedobject
from ee import ee_list
class Clusterer(computedobject.ComputedObject):
  """An object to represent an Earth Engine Clusterer.

  Example:
    # Load a pre-computed Landsat composite for input.
    input_img = ee.Image('LANDSAT/LE7_TOA_1YEAR/2001')
    # Define a region in which to generate a sample of the input.
    region = ee.Geometry.Rectangle(29.7, 30, 32.5, 31.7)
    # Make the training dataset.
    training = input_img.sample(region=region, scale=30, numPixels=5000)
    # Instantiate the clusterer and train it.
    clusterer = ee.Clusterer.wekaKMeans(15).train(training)
    # Cluster the input using the trained clusterer.
    result = input_img.cluster(clusterer)
  """

  # Set by initialize() once the API functions have been imported onto the
  # class; cleared again by reset().
  _initialized: bool = False

  def __init__(
      self,
      clusterer: computedobject.ComputedObject,
  ):
    """Creates a Clusterer wrapper.

    Args:
      clusterer: A Clusterer to cast.

    Raises:
      TypeError: If clusterer is not a ComputedObject.
    """
    self.initialize()

    if not isinstance(clusterer, computedobject.ComputedObject):
      raise TypeError(
          'Clusterer can only be used as a cast to Clusterer. Found'
          f' {type(clusterer)}.'
      )

    # There is no server-side constructor for ee.Clusterer. Pass the object
    # as-is to the server in case it is intended to be a Clusterer cast.
    super().__init__(clusterer.func, clusterer.args, clusterer.varName)

  @classmethod
  def initialize(cls) -> None:
    """Imports API functions to this class.

    Does nothing if the class has already been initialized.
    """
    if not cls._initialized:
      apifunction.ApiFunction.importApi(cls, cls.name(), cls.name())
      cls._initialized = True

  @classmethod
  def reset(cls) -> None:
    """Removes imported API functions from this class."""
    apifunction.ApiFunction.clearApi(cls)
    cls._initialized = False

  @staticmethod
  def name() -> str:
    """Returns 'Clusterer', the prefix used for this class's API functions."""
    return 'Clusterer'

  # TODO: The `| None` in the return annotation is suspect. The return from
  # schema is never None, but the .getInfo() of the result can be None.
  def schema(self) -> ee_list.List | None:
    """Returns the names of the inputs used by this Clusterer.

    Or None if this Clusterer has not had any training data added yet.
    """
    return apifunction.ApiFunction.call_(self.name() + '.schema', self)

  def train(
      self,
      features: _arg_types.FeatureCollection,
      # pylint: disable-next=invalid-name
      inputProperties: _arg_types.List | None = None,
      subsampling: _arg_types.Number | None = None,
      # pylint: disable-next=invalid-name
      subsamplingSeed: _arg_types.Integer | None = None,
  ) -> Clusterer:
    """Returns a trained Clusterer.

    Trains the Clusterer on a collection of features using the specified
    numeric properties of each feature as training data. The geometry of the
    features is ignored.

    Args:
      features: The collection to train on.
      inputProperties: The list of property names to include as training data.
        Each feature must have all these properties, and their values must be
        numeric. This argument is optional if the input collection contains a
        'band_order' property (as produced by Image.sample).
      subsampling: An optional subsampling factor, within (0, 1].
      subsamplingSeed: A randomization seed to use for subsampling.
    """
    return apifunction.ApiFunction.call_(
        self.name() + '.train',
        self,
        features,
        inputProperties,
        subsampling,
        subsamplingSeed,
    )

  @staticmethod
  def wekaCascadeKMeans(
      # pylint: disable=invalid-name
      minClusters: _arg_types.Integer | None = None,
      maxClusters: _arg_types.Integer | None = None,
      # pylint: enable=invalid-name
      restarts: _arg_types.Integer | None = None,
      manual: _arg_types.Bool | None = None,
      init: _arg_types.Bool | None = None,
      # pylint: disable=invalid-name
      distanceFunction: _arg_types.String | None = None,
      maxIterations: _arg_types.Integer | None = None,
      # pylint: enable=invalid-name
  ) -> Clusterer:
    """Returns a weka Cascade K-Means Clusterer.

    Cascade simple k-means, selects the best k according to the
    Calinski-Harabasz criterion. For more information see:
    Calinski, T. and J. Harabasz. 1974. A dendrite method for cluster analysis,
    Commun. Stat. 3: 1-27.

    Args:
      minClusters: Min number of clusters.
      maxClusters: Max number of clusters.
      restarts: Number of restarts.
      manual: Manually select the number of clusters.
      init: Set whether to initialize using the probabilistic farthest first
        like method of the k-means++ algorithm (rather than the standard random
        selection of initial cluster centers).
      distanceFunction: Distance function to use. Options are: Euclidean and
        Manhattan.
      maxIterations: Maximum number of iterations for k-means.
    """
    return apifunction.ApiFunction.call_(
        'Clusterer.wekaCascadeKMeans',
        minClusters,
        maxClusters,
        restarts,
        manual,
        init,
        distanceFunction,
        maxIterations,
    )

  @staticmethod
  def wekaCobweb(
      acuity: _arg_types.Number | None = None,
      cutoff: _arg_types.Number | None = None,
      seed: _arg_types.Integer | None = None,
  ) -> Clusterer:
    """Returns a weka Cobweb Clusterer.

    Implementation of the Cobweb clustering algorithm. For more information see:
    D. Fisher (1987). Knowledge acquisition via incremental conceptual
    clustering. Machine Learning. 2(2):139-172. and J. H. Gennari, P. Langley,
    D. Fisher (1990). Models of incremental concept formation. Artificial
    Intelligence. 40:11-61.

    Args:
      acuity: Acuity (minimum standard deviation).
      cutoff: Cutoff (minimum category utility).
      seed: Random number seed.
    """
    return apifunction.ApiFunction.call_(
        'Clusterer.wekaCobweb', acuity, cutoff, seed
    )

  @staticmethod
  def wekaKMeans(
      nClusters: _arg_types.Integer,  # pylint: disable=invalid-name
      init: _arg_types.Integer | None = None,
      canopies: _arg_types.Bool | None = None,
      # pylint: disable=invalid-name
      maxCandidates: _arg_types.Integer | None = None,
      periodicPruning: _arg_types.Integer | None = None,
      minDensity: _arg_types.Integer | None = None,
      # pylint: enable=invalid-name
      t1: _arg_types.Number | None = None,
      t2: _arg_types.Number | None = None,
      # pylint: disable=invalid-name
      distanceFunction: _arg_types.String | None = None,
      maxIterations: _arg_types.Integer | None = None,
      preserveOrder: _arg_types.Bool | None = None,
      # pylint: enable=invalid-name
      fast: _arg_types.Bool | None = None,
      seed: _arg_types.Integer | None = None,
  ) -> Clusterer:
    """Returns a weka K-Means Clusterer.

    Cluster data using the k-means algorithm. Can use either the Euclidean
    distance (default) or the Manhattan distance. If the Manhattan distance is
    used, then centroids are computed as the component-wise median rather than
    mean. For more information see: D. Arthur, S. Vassilvitskii: k-means++: the
    advantages of careful seeding. In: Proceedings of the eighteenth annual
    ACM-SIAM symposium on Discrete algorithms, 1027-1035, 2007.

    Args:
      nClusters: Number of clusters.
      init: Initialization method to use. 0 = random, 1 = k-means++, 2 = canopy,
        3 = farthest first.
      canopies: Use canopies to reduce the number of distance calculations.
      maxCandidates: Maximum number of candidate canopies to retain in memory at
        any one time when using canopy clustering. T2 distance, plus data
        characteristics, will determine how many candidate canopies are formed
        before periodic and final pruning are performed, which might result in
        excess memory consumption. This setting avoids large numbers of
        candidate canopies consuming memory.
      periodicPruning: How often to prune low density canopies when using canopy
        clustering.
      minDensity: Minimum canopy density, when using canopy clustering, below
        which a canopy will be pruned during periodic pruning.
      t1: The T1 distance to use when using canopy clustering. A value < 0 is
        taken as a positive multiplier for T2.
      t2: The T2 distance to use when using canopy clustering. Values < 0 cause
        a heuristic based on attribute std. deviation to be used.
      distanceFunction: Distance function to use. Options are: Euclidean and
        Manhattan.
      maxIterations: Maximum number of iterations.
      preserveOrder: Preserve order of instances.
      fast: Enables faster distance calculations, using cut-off values. Disables
        the calculation/output of squared errors/distances.
      seed: The randomization seed.
    """
    return apifunction.ApiFunction.call_(
        'Clusterer.wekaKMeans',
        nClusters,
        init,
        canopies,
        maxCandidates,
        periodicPruning,
        minDensity,
        t1,
        t2,
        distanceFunction,
        maxIterations,
        preserveOrder,
        fast,
        seed,
    )

  @staticmethod
  def wekaLVQ(
      # pylint: disable=invalid-name
      numClusters: _arg_types.Integer | None = None,
      learningRate: _arg_types.Number | None = None,
      # pylint: enable=invalid-name
      epochs: _arg_types.Integer | None = None,
      # pylint: disable-next=invalid-name
      normalizeInput: _arg_types.Bool | None = None,
  ) -> Clusterer:
    """Returns a weka Learning Vector Quantization (LVQ) Clusterer.

    A Clusterer that implements the Learning Vector Quantization algorithm. For
    more details, see: T. Kohonen, "Learning Vector Quantization", The Handbook
    of Brain Theory and Neural Networks, 2nd Edition, MIT Press, 2003, pp.
    631-634.

    Args:
      numClusters: The number of clusters.
      learningRate: The learning rate for the training algorithm. Value should
        be greater than 0 and less or equal to 1.
      epochs: Number of training epochs. Value should be greater than or equal
        to 1.
      normalizeInput: Skip normalizing the attributes.
    """
    return apifunction.ApiFunction.call_(
        'Clusterer.wekaLVQ', numClusters, learningRate, epochs, normalizeInput
    )

  @staticmethod
  def wekaXMeans(
      # pylint: disable=invalid-name
      minClusters: _arg_types.Integer | None = None,
      maxClusters: _arg_types.Integer | None = None,
      maxIterations: _arg_types.Integer | None = None,
      maxKMeans: _arg_types.Integer | None = None,
      maxForChildren: _arg_types.Integer | None = None,
      useKD: _arg_types.Bool | None = None,
      cutoffFactor: _arg_types.Number | None = None,
      distanceFunction: _arg_types.String | None = None,
      # pylint: enable=invalid-name
      seed: _arg_types.Integer | None = None,
  ) -> Clusterer:
    """Returns a weka X-Means Clusterer.

    X-Means is K-Means with an efficient estimation of the number of clusters.
    For more information see: Dan Pelleg, Andrew W. Moore: X-means: Extending
    K-means with Efficient Estimation of the Number of Clusters. In: Seventeenth
    International Conference on Machine Learning, 727-734, 2000.

    Args:
      minClusters: Minimum number of clusters.
      maxClusters: Maximum number of clusters.
      maxIterations: Maximum number of overall iterations.
      maxKMeans: The maximum number of iterations to perform in KMeans.
      maxForChildren: The maximum number of iterations in KMeans that is
        performed on the child centers.
      useKD: Use a KDTree.
      cutoffFactor: Takes the given percentage of the split centroids if none of
        the children win.
      distanceFunction: Distance function to use. Options are: Chebyshev,
        Euclidean, and Manhattan.
      seed: The randomization seed.
    """
    return apifunction.ApiFunction.call_(
        'Clusterer.wekaXMeans',
        minClusters,
        maxClusters,
        maxIterations,
        maxKMeans,
        maxForChildren,
        useKD,
        cutoffFactor,
        distanceFunction,
        seed,
    )