mirror of
https://github.com/meteoinfo/MeteoInfo.git
synced 2025-12-08 20:36:05 +00:00
1114 lines
39 KiB
Python
1114 lines
39 KiB
Python
# coding=utf-8
|
||
#-----------------------------------------------------
|
||
# Author: Yaqiang Wang
|
||
# Date: 2017-3-7
|
||
# Purpose: MeteoInfo DataFrame module
|
||
# Note: Jython
|
||
#-----------------------------------------------------
|
||
|
||
import datetime
|
||
|
||
from org.meteoinfo.dataframe import DataFrame as MIDataFrame
|
||
from org.meteoinfo.dataframe import Series as MISeries
|
||
from org.meteoinfo.ndarray import Range, Array
|
||
|
||
import mipylib.numeric as np
|
||
import mipylib.miutil as miutil
|
||
from index import Index
|
||
import series
|
||
import groupby
|
||
from indexing import LocIndexer, ILocIndexer, AtIndexer, IAtIndexer
|
||
|
||
from java.lang import Double
|
||
nan = Double.NaN
|
||
|
||
class DataFrame(object):
|
||
"""
|
||
Two-dimensional size-mutable, potentially heterogeneous tabular data structure with
|
||
labeled axes (rows and columns). Arithmetic operations align on both row and column
|
||
labels. Can be thought of as a dict-like container for Series objects.
|
||
|
||
:param data: (*array_like*) Two-dimensional array data or list of one-dimensional arrays.
|
||
:param index: (*list*) Data index list. Values must be unique and hashable, same length as data.
|
||
:param columns: (*list*) Column labels to use for resulting frame. Will default to
|
||
arange(n) if no column labels are provided
|
||
"""
|
||
def __init__(self, data=None, index=None, columns=None, dataframe=None):
    """
    Create a data frame from Python data, or wrap an existing Java data frame.

    :param data: (*array_like or dict*) Two-dimensional array, list of one-dimensional
        arrays (plain lists/tuples are converted to arrays), or a dict mapping column
        names to values. In a dict, scalar values are repeated to the length of the
        sequence valued entries.
    :param index: (*list, array or Index*) Row labels. Defaults to ``range(n)`` where ``n``
        is the row count derived from ``data``.
    :param columns: (*list*) Column labels. Replaced by the dict keys when ``data`` is a dict.
    :param dataframe: (*MIDataFrame*) Existing Java data frame to wrap. When given, all
        other arguments are ignored and the index is read from it.

    :raises ValueError: If ``data`` is given and ``index`` has a different length.
    """
    if dataframe is not None:
        self._dataframe = dataframe
        self._index = Index.factory(index=self._dataframe.getIndex())
        return

    # Row count implied by ``data``; stays None when only an index is supplied
    n = None
    if data is not None:
        if isinstance(data, dict):
            columns = data.keys()
            values = []
            n = 1
            for v in data.values():
                if isinstance(v, (list, tuple)):
                    n = len(v)
                    v = np.array(v)
                elif isinstance(v, np.NDArray):
                    n = len(v)
                values.append(v)
            # Scalars are broadcast to the column length found above
            data = [v if isinstance(v, np.NDArray) else np.array([v] * n) for v in values]

        if isinstance(data, np.NDArray):
            n = len(data)
            data = data._array
        else:
            n = len(data[0])
            dlist = []
            for dd in data:
                # Accept plain Python sequences as columns as well as arrays
                if isinstance(dd, (list, tuple)):
                    dd = np.array(dd)
                dlist.append(dd._array)
            data = dlist

    if index is None:
        index = range(0, 0 if n is None else n)
    elif n is not None and n != len(index):
        # The length can only be validated when data supplied a row count
        raise ValueError('Wrong length of index!')

    if isinstance(index, np.NDArray):
        index = index.tolist()

    if isinstance(index, Index):
        self._index = index
    else:
        self._index = Index.factory(index)

    if data is None:
        self._dataframe = MIDataFrame(self._index._index)
    else:
        self._dataframe = MIDataFrame(data, self._index._index, columns)
|
||
|
||
#---- index property
|
||
def get_index(self):
    """
    Get the row index.

    :returns: The index object held by this data frame.
    """
    return self._index

def set_index(self, value):
    """
    Replace the row index, keeping the current index name.

    :param value: New index labels. A ``Series`` contributes its ``values``.
    """
    labels = value.values if isinstance(value, series.Series) else value
    self._index = Index.factory(labels, self._index.name)
    # Keep the wrapped Java data frame in step with the Python side index
    self._dataframe.setIndex(self._index._index)

index = property(get_index, set_index)
|
||
|
||
#---- data property
|
||
def get_data(self):
    """
    Get the data of the wrapped data frame.

    :returns: A single array when the Java side returns an ``Array``, otherwise a list
        with one array per item of the returned collection.
    """
    raw = self._dataframe.getData()
    if isinstance(raw, Array):
        return np.array(raw)
    return [np.array(item) for item in raw]

def set_data(self, value):
    """
    Replace the data of the wrapped data frame.

    :param value: (*array_like*) New data; converted with ``np.array`` first.
    """
    arr = np.array(value)
    self._dataframe.setData(arr._array)

values = property(get_data, set_data)
|
||
|
||
#---- columns property
|
||
def get_columns(self):
    """
    Get the columns object of the wrapped data frame.

    :returns: Whatever the Java ``getColumns`` call returns.
    """
    cols = self._dataframe.getColumns()
    return cols

def set_columns(self, value):
    """
    Set the columns of the wrapped data frame.

    :param value: New columns, passed unchanged to the Java ``setColumns`` call.
    """
    self._dataframe.setColumns(value)

columns = property(get_columns, set_columns)
|
||
|
||
#---- shape property
|
||
def get_shape(self):
    """
    Get the shape of the data frame.

    :returns: (*tuple*) The entries of the Java ``getShape`` result; the rest of this
        class reads them as ``(rows, columns)``.
    """
    dims = self._dataframe.getShape()
    return tuple(dims[i] for i in range(len(dims)))

shape = property(get_shape)
|
||
|
||
#---- dtypes property
|
||
def get_dtypes(self):
    """
    Get the data type of every column.

    :returns: (*Series*) Series named ``DataTypes`` holding the column data types,
        indexed by column name.
    """
    names = list(self.columns.getNames())
    kinds = list(self.columns.getDataTypes())
    return series.Series(kinds, names, 'DataTypes')

dtypes = property(get_dtypes)
|
||
|
||
@property
def loc(self):
    """
    Label based indexer: access a group of rows and columns by label(s) or a boolean array.

    :returns: (*LocIndexer*) Indexer bound to this data frame.
    """
    indexer = LocIndexer(self)
    return indexer
|
||
|
||
@property
def iloc(self):
    """
    Position based indexer: purely integer-location based selection.

    :returns: (*ILocIndexer*) Indexer bound to this data frame.
    """
    indexer = ILocIndexer(self)
    return indexer
|
||
|
||
@property
def at(self):
    """
    Scalar label indexer: access a single value for a row/column label pair.

    :returns: (*AtIndexer*) Indexer bound to this data frame.
    """
    indexer = AtIndexer(self)
    return indexer
|
||
|
||
@property
def iat(self):
    """
    Scalar position indexer: access a single value for a row/column pair by integer position.

    :returns: (*IAtIndexer*) Indexer bound to this data frame.
    """
    indexer = IAtIndexer(self)
    return indexer
|
||
|
||
def __getitem__(self, key):
    """
    Select a column, a single value, or a block of rows and columns.

    :param key: A column name (returns that column as a Series), a ``(row, column)``
        tuple, or a row selector alone. Row selectors may be an integer position, an
        index label string, a slice of positions or label strings, a list/tuple/array/
        Series of positions, booleans or labels, an ``Index``, or any other label
        understood by the index ``get_loc``. Column selectors may be an integer
        position, a slice of positions, a list of positions or names, or a column name.

    :returns: A scalar for ``(int, int)`` and ``(int, str)`` keys, otherwise a Series or
        DataFrame. ``None`` when the Java side yields no result, a row label string is
        not found, or the column selector type is unsupported.
    """
    # A bare string selects one whole column
    if isinstance(key, basestring):
        data = self._dataframe.getColumnData(key)
        if data is None:
            return data
        idx = self._index[:]
        return series.Series(np.array(data), idx, key)

    hascolkey = True
    if isinstance(key, tuple):
        ridx = key[0]
        cidx = key[1]
        if isinstance(ridx, int) and isinstance(cidx, (int, basestring)):
            # Scalar access; negative positions count from the end
            if ridx < 0:
                ridx = self.shape[0] + ridx
            if isinstance(cidx, int) and cidx < 0:
                cidx = self.shape[1] + cidx
            return self._dataframe.getValue(ridx, cidx)
    else:
        key = (key, slice(None))
        hascolkey = False

    # ---- row selector
    k = key[0]
    if isinstance(k, Index):
        k = k.data
    if isinstance(k, int):
        if k < 0:
            k = self.shape[0] + k
        rowkey = k
    elif isinstance(k, basestring):
        sidx = self._index.index(k)
        if sidx < 0:
            return None
        rowkey = Range(sidx, sidx, 1)
    elif isinstance(k, slice):
        if isinstance(k.start, basestring):
            sidx = self._index.index(k.start)
            if sidx < 0:
                sidx = 0
        else:
            sidx = 0 if k.start is None else k.start
            if sidx < 0:
                sidx = self.shape[0] + sidx
        if isinstance(k.stop, basestring):
            # A label stop is used as the (inclusive) end position
            eidx = self._index.index(k.stop)
            if eidx < 0:
                eidx = self.shape[0] + eidx
        else:
            # Range ends are inclusive, hence the "- 1" on a positional stop
            eidx = self.shape[0] - 1 if k.stop is None else k.stop - 1
            if eidx < 0:
                eidx = self.shape[0] + eidx
        step = 1 if k.step is None else k.step
        rowkey = Range(sidx, eidx, step)
    elif isinstance(k, (list, tuple, np.NDArray, series.Series)):
        k0 = k.iloc[0] if isinstance(k, series.Series) else k[0]
        if isinstance(k0, (int, bool)):
            rowkey = k if isinstance(k, (list, tuple)) else k.asarray()
        else:
            # Labels: keep the positions of those found in the index
            rowkey = []
            for label in k:
                idx = self._index.index(label)
                if idx >= 0:
                    rowkey.append(idx)
    else:
        rowkey = self._index.get_loc(k)

    # ---- column selector
    if not hascolkey:
        colkey = Range(0, self.shape[1] - 1, 1)
    else:
        k = key[1]
        if isinstance(k, int):
            sidx = k
            if sidx < 0:
                sidx = self.shape[1] + sidx
            colkey = Range(sidx, sidx, 1)
        elif isinstance(k, slice):
            sidx = 0 if k.start is None else k.start
            if sidx < 0:
                sidx = self.shape[1] + sidx
            eidx = self.shape[1] - 1 if k.stop is None else k.stop - 1
            if eidx < 0:
                eidx = self.shape[1] + eidx
            step = 1 if k.step is None else k.step
            colkey = Range(sidx, eidx, step)
        elif isinstance(k, list):
            if isinstance(k[0], int):
                colkey = k
            else:
                colkey = self.columns.indexOfName(k)
        elif isinstance(k, basestring):
            col = self.columns.indexOf(k)
            # Range ends are inclusive everywhere else in this method (see the int
            # branch above), so a single column is Range(col, col), not (col, col + 1)
            colkey = Range(col, col, 1)
        else:
            return None

    r = self._dataframe.select(rowkey, colkey)
    if r is None:
        return None
    if isinstance(r, MISeries):
        return series.Series(series=r)
    return DataFrame(dataframe=r)
|
||
|
||
def __setitem__(self, key, value):
    """
    Assign a column, a single value, or a block of rows and columns.

    :param key: A column name (sets or adds that column), a DataFrame or array used as a
        mask for the Java ``setValues(mask, value)`` call, a ``(row, column)`` tuple, or a
        row selector alone. Row selectors may be an integer position, an index label
        string, a slice of positions or label strings, or a list of positions or labels.
        Column selectors may be an integer position, a slice of positions, a list of
        positions or names, or a column name. Unsupported selector types, and row label
        strings that are not found, are ignored silently.
    :param value: Value(s) to assign. Python ``datetime`` objects, and lists/tuples that
        start with one, are converted with ``miutil.jdatetime``; lists/tuples become arrays.
    """
    if isinstance(value, datetime.datetime):
        value = miutil.jdatetime(value)
    if isinstance(value, (list, tuple)):
        # Guard the peek at the first element so empty sequences do not raise
        if len(value) > 0 and isinstance(value[0], datetime.datetime):
            value = miutil.jdatetime(value)
        value = np.array(value)
    if isinstance(value, np.NDArray):
        value = value._array

    # A bare string sets one whole column
    if isinstance(key, basestring):
        if isinstance(value, series.Series):
            value = value.values._array
        self._dataframe.setColumn(key, value)
        return

    if isinstance(key, DataFrame):
        key = key.values

    if isinstance(key, np.NDArray):
        self._dataframe.setValues(key.asarray(), value)
        # Nothing further applies to an array key; return explicitly
        return

    hascolkey = True
    if isinstance(key, tuple):
        ridx = key[0]
        cidx = key[1]
        if isinstance(ridx, int) and isinstance(cidx, (int, basestring)):
            # Scalar assignment; negative positions count from the end
            if ridx < 0:
                ridx = self.shape[0] + ridx
            if isinstance(cidx, int) and cidx < 0:
                cidx = self.shape[1] + cidx
            self._dataframe.setValue(ridx, cidx, value)
            return
    else:
        key = (key, slice(None))
        hascolkey = False

    # ---- row selector
    k = key[0]
    if isinstance(k, int):
        if k < 0:
            k = self.shape[0] + k
        rowkey = k
    elif isinstance(k, basestring):
        sidx = self._index.index(k)
        if sidx < 0:
            return None
        rowkey = Range(sidx, sidx, 1)
    elif isinstance(k, slice):
        if isinstance(k.start, basestring):
            sidx = self._index.index(k.start)
            if sidx < 0:
                sidx = 0
        else:
            sidx = 0 if k.start is None else k.start
            if sidx < 0:
                sidx = self.shape[0] + sidx
        if isinstance(k.stop, basestring):
            eidx = self._index.index(k.stop)
            if eidx < 0:
                eidx = self.shape[0] + eidx
        else:
            # Range ends are inclusive, hence the "- 1" on a positional stop
            eidx = self.shape[0] - 1 if k.stop is None else k.stop - 1
            if eidx < 0:
                eidx = self.shape[0] + eidx
        step = 1 if k.step is None else k.step
        rowkey = Range(sidx, eidx, step)
    elif isinstance(k, list):
        if isinstance(k[0], int):
            rowkey = k
        else:
            # Labels: keep the positions of those found in the index
            rowkey = []
            for label in k:
                idx = self._index.index(label)
                if idx >= 0:
                    rowkey.append(idx)
    else:
        return

    # ---- column selector
    if not hascolkey:
        colkey = Range(0, self.shape[1] - 1, 1)
    else:
        k = key[1]
        if isinstance(k, int):
            sidx = k
            if sidx < 0:
                sidx = self.shape[1] + sidx
            colkey = Range(sidx, sidx, 1)
        elif isinstance(k, slice):
            sidx = 0 if k.start is None else k.start
            if sidx < 0:
                sidx = self.shape[1] + sidx
            eidx = self.shape[1] - 1 if k.stop is None else k.stop - 1
            if eidx < 0:
                eidx = self.shape[1] + eidx
            step = 1 if k.step is None else k.step
            colkey = Range(sidx, eidx, step)
        elif isinstance(k, list):
            if isinstance(k[0], int):
                colkey = k
            else:
                colkey = self.columns.indexOfName(k)
        elif isinstance(k, basestring):
            col = self.columns.indexOf(k)
            # Range ends are inclusive everywhere else in this method (see the int
            # branch above), so a single column is Range(col, col), not (col, col + 1)
            colkey = Range(col, col, 1)
        else:
            return

    self._dataframe.setValues(rowkey, colkey, value)
|
||
|
||
def _getitem_loc(self, key):
    """
    Label based selection.

    :param key: A row selector, or a ``(row, column)`` tuple. Rows may be a slice of index
        labels or anything accepted by the index ``get_loc``. Columns may be ``None`` (all
        columns), a slice of column names, a list of names, or a single name.

    :returns: A scalar when exactly one row and one column are located through
        ``get_loc``, otherwise a Series or DataFrame. ``None`` for an unsupported column
        selector type or when the Java side yields no result.

    :raises KeyError: If a slice bound, a row label or a single column name is not found.
    """
    if not isinstance(key, tuple):
        key = (key, None)

    # ---- rows
    row_sel = key[0]
    row_labels = row_sel
    if isinstance(row_sel, slice):
        if row_sel.start is None:
            first = 0
        else:
            first = self._index.index(row_sel.start)
        if first < 0:
            raise KeyError(key)
        if row_sel.stop is None:
            last = self.shape[0] - 1
        else:
            last = self._index.index(row_sel.stop)
        if last < 0:
            raise KeyError(key)
        stride = 1 if row_sel.step is None else row_sel.step
        rowkey = Range(first, last, stride)
    else:
        located = self._index.get_loc(row_sel, outkeys=True)
        if isinstance(located, tuple):
            rowkey, row_labels = located[0], located[1]
        else:
            rowkey, row_labels = located, None
        if len(rowkey) == 0:
            raise KeyError(key)

    # ---- columns
    col_sel = key[1]
    if col_sel is None:
        colkey = Range(0, self.shape[1] - 1, 1)
    elif isinstance(col_sel, slice):
        if col_sel.start is None:
            first = 0
        else:
            first = self.columns.indexOfName(col_sel.start)
        if first < 0:
            raise KeyError(key)
        if col_sel.stop is None:
            last = self.shape[1] - 1
        else:
            last = self.columns.indexOfName(col_sel.stop)
        if last < 0:
            raise KeyError(key)
        stride = 1 if col_sel.step is None else col_sel.step
        colkey = Range(first, last, stride)
    elif isinstance(col_sel, list):
        colkey = self.columns.indexOfName(col_sel)
    elif isinstance(col_sel, basestring):
        pos = self.columns.indexOfName(col_sel)
        if pos < 0:
            raise KeyError(key)
        colkey = [pos]
    else:
        return None

    # ---- selection
    if isinstance(rowkey, (int, Range)):
        result = self._dataframe.select(rowkey, colkey)
    else:
        col_is_range = isinstance(colkey, Range)
        ncol = colkey.length() if col_is_range else len(colkey)
        if len(rowkey) == 1 and ncol == 1:
            # Exactly one cell: hand back the scalar
            col0 = colkey.first() if col_is_range else colkey[0]
            return self._dataframe.getValue(rowkey[0], col0)
        if row_labels is None:
            result = self._dataframe.select(rowkey, colkey)
        else:
            if not isinstance(row_labels, list):
                row_labels = [row_labels]
            result = self._dataframe.select(row_labels, rowkey, colkey)

    if result is None:
        return None
    if isinstance(result, MISeries):
        return series.Series(series=result)
    return DataFrame(dataframe=result)
|
||
|
||
def _setitem_loc(self, key, value):
    """
    Label based assignment.

    :param key: A row selector, or a ``(row, column)`` tuple. Rows may be a slice of index
        labels or anything accepted by the index ``get_loc``. Columns may be ``None`` (all
        columns), a slice of column names, a list of names, or a single name.
    :param value: Value(s) to assign. A Python ``datetime`` is converted with
        ``miutil.jdatetime``; lists/tuples become arrays.

    :raises KeyError: If a slice bound, a row label or a single column name is not found,
        or the column selector type is unsupported.
    """
    if isinstance(value, datetime.datetime):
        value = miutil.jdatetime(value)
    if isinstance(value, (list, tuple)):
        value = np.array(value)
    if isinstance(value, np.NDArray):
        value = value._array

    if not isinstance(key, tuple):
        key = (key, None)

    # ---- rows
    row_sel = key[0]
    if isinstance(row_sel, slice):
        if row_sel.start is None:
            first = 0
        else:
            first = self._index.index(row_sel.start)
        if first < 0:
            raise KeyError(key)
        if row_sel.stop is None:
            last = self.shape[0] - 1
        else:
            last = self._index.index(row_sel.stop)
        if last < 0:
            raise KeyError(key)
        stride = 1 if row_sel.step is None else row_sel.step
        rowkey = Range(first, last, stride)
    else:
        located = self._index.get_loc(row_sel, outkeys=True)
        rowkey = located[0] if isinstance(located, tuple) else located
        if len(rowkey) == 0:
            raise KeyError(key)

    # ---- columns
    col_sel = key[1]
    if col_sel is None:
        colkey = Range(0, self.shape[1] - 1, 1)
    elif isinstance(col_sel, slice):
        if col_sel.start is None:
            first = 0
        else:
            first = self.columns.indexOfName(col_sel.start)
        if first < 0:
            raise KeyError(key)
        if col_sel.stop is None:
            last = self.shape[1] - 1
        else:
            last = self.columns.indexOfName(col_sel.stop)
        if last < 0:
            raise KeyError(key)
        stride = 1 if col_sel.step is None else col_sel.step
        colkey = Range(first, last, stride)
    elif isinstance(col_sel, list):
        colkey = self.columns.indexOfName(col_sel)
    elif isinstance(col_sel, basestring):
        pos = self.columns.indexOfName(col_sel)
        if pos < 0:
            raise KeyError(key)
        colkey = [pos]
    else:
        raise KeyError(key)

    # ---- assignment
    if isinstance(rowkey, (int, Range)):
        self._dataframe.setValues(rowkey, colkey, value)
        return

    col_is_range = isinstance(colkey, Range)
    ncol = colkey.length() if col_is_range else len(colkey)
    if len(rowkey) == 1 and ncol == 1:
        # Exactly one cell: use the scalar setter
        col0 = colkey.first() if col_is_range else colkey[0]
        self._dataframe.setValue(rowkey[0], col0, value)
    else:
        self._dataframe.setValues(rowkey, colkey, value)
|
||
|
||
def _getitem_iloc(self, key):
    """
    Integer position based selection.

    :param key: A row selector, or a ``(row, column)`` tuple. Each selector may be an
        integer position, a slice of positions, a list of positions or an array; the
        column selector may also be ``None`` for all columns. Negative integers count
        from the end.

    :returns: A scalar for an ``(int, int)`` key, otherwise a Series or DataFrame.
        ``None`` for an unsupported selector type or when the Java side yields no result.
    """
    if not isinstance(key, tuple):
        key = (key, None)

    if isinstance(key[0], int) and isinstance(key[1], int):
        ridx = key[0]
        cidx = key[1]
        # Normalise negative positions, as __getitem__ and _getitem_iat do
        if ridx < 0:
            ridx = self.shape[0] + ridx
        if cidx < 0:
            cidx = self.shape[1] + cidx
        return self._dataframe.getValue(ridx, cidx)

    # ---- rows
    k = key[0]
    if isinstance(k, int):
        if k < 0:
            k = self.shape[0] + k
        rowkey = k
    elif isinstance(k, slice):
        sidx = 0 if k.start is None else k.start
        if sidx < 0:
            sidx = self.shape[0] + sidx
        # Range ends are inclusive, hence the "- 1" on the slice stop
        eidx = self.shape[0] - 1 if k.stop is None else k.stop - 1
        if eidx < 0:
            eidx = self.shape[0] + eidx
        step = 1 if k.step is None else k.step
        rowkey = Range(sidx, eidx, step)
    elif isinstance(k, list):
        rowkey = k
    elif isinstance(k, np.NDArray):
        rowkey = k.aslist()
    else:
        return None

    # ---- columns
    k = key[1]
    if k is None:
        colkey = Range(0, self.shape[1] - 1, 1)
    elif isinstance(k, int):
        sidx = k
        if sidx < 0:
            sidx = self.shape[1] + sidx
        colkey = Range(sidx, sidx, 1)
    elif isinstance(k, slice):
        sidx = 0 if k.start is None else k.start
        if sidx < 0:
            sidx = self.shape[1] + sidx
        eidx = self.shape[1] - 1 if k.stop is None else k.stop - 1
        if eidx < 0:
            eidx = self.shape[1] + eidx
        step = 1 if k.step is None else k.step
        colkey = Range(sidx, eidx, step)
    elif isinstance(k, list):
        colkey = k
    elif isinstance(k, np.NDArray):
        colkey = k.aslist()
    else:
        return None

    r = self._dataframe.select(rowkey, colkey)
    if r is None:
        return None
    if isinstance(r, MISeries):
        return series.Series(series=r)
    return DataFrame(dataframe=r)
|
||
|
||
def _getitem_at(self, key):
    """
    Get a single value by row label and column name.

    :param key: (*tuple*) ``(row label, column name)`` pair.

    :returns: The value stored at that position.

    :raises KeyError: If the row label or the column name is not found.
    """
    row_label, col_name = key[0], key[1]
    row = self._index.index(row_label)
    if row < 0:
        raise KeyError(key)
    col = self.columns.indexOfName(col_name)
    if col < 0:
        raise KeyError(key)
    return self._dataframe.getValue(row, col)
|
||
|
||
def _getitem_iat(self, key):
    """
    Get a single value by integer row and column position.

    :param key: (*tuple*) ``(row, column)`` positions; negative values count from the end.

    :returns: The value stored at that position.
    """
    row, col = key[0], key[1]
    nrow_if_needed = None
    if row < 0:
        nrow_if_needed = self.shape[0]
        row = nrow_if_needed + row
    if col < 0:
        col = self.shape[1] + col
    return self._dataframe.getValue(row, col)
|
||
|
||
def __getkey(self, key):
    """
    Translate label key(s) into integer positions using the index ``get_indices``.

    :param key: A label string, a list/tuple/array of label strings, or any other key.

    :returns: ``(positions, index)`` for a single label, where ``positions`` is a single
        int when exactly one row matches and a list otherwise;
        ``(positions, index, data, index)`` (the four items returned by ``get_indices``)
        for a sequence of labels; ``(key, None)`` for any other key.

    :raises KeyError: If no row matches the label(s).
    """
    if isinstance(key, basestring):
        rkey = self.index.get_indices(key)
        ikey = rkey[0]
        rindex = rkey[1]
        if len(ikey) == 1:
            ikey = ikey[0]
        elif len(ikey) > 1:
            ikey = list(ikey)
        else:
            raise KeyError(key)
        return ikey, rindex

    # len() guard: peeking at key[0] on an empty sequence would raise IndexError
    if isinstance(key, (list, tuple, np.NDArray)) and len(key) > 0 \
            and isinstance(key[0], basestring):
        if isinstance(key, np.NDArray):
            key = key.asarray()
        rkey = self.index.get_indices(key)
        ikey = rkey[0]
        rindex = rkey[1]
        rdata = rkey[2]
        rrindex = rkey[3]
        if len(ikey) == 0:
            # Report the offending labels, like the single label branch does
            raise KeyError(key)
        return list(ikey), rindex, rdata, rrindex

    return key, None
|
||
|
||
def __iter__(self):
    """
    Iterate over the row index of the data frame.

    :returns: Iterator over ``self.index``.
    """
    row_index = self.index
    return iter(row_index)
|
||
|
||
def iteritems(self):
    """
    Pair the index entries with the items produced by iterating this object.

    NOTE(review): iterating a DataFrame yields its index (see ``__iter__``), so both
    members of every pair are index entries - confirm this is intended.

    :returns: ``zip`` of the index iterator and the iterator of this object.
    """
    labels = iter(self.index)
    items = iter(self)
    return zip(labels, items)
|
||
|
||
def __len__(self):
    """Return the first entry of ``shape`` (the row count)."""
    nrow = self.shape[0]
    return nrow
|
||
|
||
def __str__(self):
    """Return the text produced by the Java data frame ``toString``."""
    text = self._dataframe.toString()
    return text
|
||
|
||
def __repr__(self):
    """Return the same text as ``__str__``: the Java data frame ``toString`` output."""
    text = self._dataframe.toString()
    return text
|
||
|
||
def __eq__(self, other):
    """
    Compare with ``other`` through the Java ``equal`` call.

    :returns: (*DataFrame*) Wrapped comparison result.
    """
    compared = self._dataframe.equal(other)
    return DataFrame(dataframe=compared)
|
||
|
||
def __lt__(self, other):
    """
    Compare with ``other`` through the Java ``lessThan`` call.

    :returns: (*DataFrame*) Wrapped comparison result.
    """
    compared = self._dataframe.lessThan(other)
    return DataFrame(dataframe=compared)
|
||
|
||
def __le__(self, other):
    """
    Compare with ``other`` through the Java ``lessThanOrEqual`` call.

    :returns: (*DataFrame*) Wrapped comparison result.
    """
    compared = self._dataframe.lessThanOrEqual(other)
    return DataFrame(dataframe=compared)
|
||
|
||
def __gt__(self, other):
    """
    Compare with ``other`` through the Java ``greaterThan`` call.

    :returns: (*DataFrame*) Wrapped comparison result.
    """
    compared = self._dataframe.greaterThan(other)
    return DataFrame(dataframe=compared)
|
||
|
||
def __ge__(self, other):
    """
    Compare with ``other`` through the Java ``greaterThanOrEqual`` call.

    :returns: (*DataFrame*) Wrapped comparison result.
    """
    compared = self._dataframe.greaterThanOrEqual(other)
    return DataFrame(dataframe=compared)
|
||
|
||
def head(self, n=5):
    """
    Print the top rows.

    The result of the Java ``head`` call is printed; nothing is returned.

    :param n: (*int*) Row number. Default is ``5``.
    """
    top = self._dataframe.head(n)
    print(top)
|
||
|
||
def tail(self, n=5):
    """
    Print the bottom rows.

    The result of the Java ``tail`` call is printed; nothing is returned.

    :param n: (*int*) Row number. Default is ``5``.
    """
    bottom = self._dataframe.tail(n)
    print(bottom)
|
||
|
||
def transpose(self):
    """
    Transpose the data frame.

    :returns: (*DataFrame*) Wrapped result of the Java ``transpose`` call.
    """
    flipped = self._dataframe.transpose()
    return DataFrame(dataframe=flipped)

T = property(transpose)
|
||
|
||
def insert(self, loc, column, value):
    """
    Insert a column at the given location.

    :param loc: (*int*) Insertion index.
    :param column: (*string*) Label of the inserted column.
    :param value: (*array_like*) Column values. Python ``datetime`` objects (single, or as
        the first element of a list/tuple/``Index``) trigger conversion with
        ``miutil.jdatetime``; lists, tuples and ``Index`` objects become arrays.
    """
    if isinstance(value, datetime.datetime):
        value = miutil.jdatetime(value)
    if isinstance(value, (list, tuple)):
        items = miutil.jdatetime(value) if isinstance(value[0], datetime.datetime) else value
        value = np.array(items)
    if isinstance(value, Index):
        raw = value.data
        if isinstance(value[0], datetime.datetime):
            raw = miutil.jdatetime(raw)
        value = np.array(raw)
    if isinstance(value, np.NDArray):
        # The Java call takes the wrapped array rather than the Python wrapper
        value = value._array
    self._dataframe.addColumn(loc, column, value)
|
||
|
||
def drop(self, labels=None, axis=0, index=None, columns=None):
    """
    Drop specified labels from rows or columns.

    :param labels: (*single label or list-like*) Index or column labels to drop. Only
        used when neither ``index`` nor ``columns`` is given.
    :param axis: (*0 or 'index', 1 or 'columns'*) Whether ``labels`` refers to the index
        (0 or 'index') or to the columns (any other value).
    :param index: (*single label or list-like*) Alternative to specifying axis (labels,
        axis=0 is equivalent to index=labels).
    :param columns: (*single label or list-like*) Alternative to specifying axis (labels,
        axis=1 is equivalent to columns=labels).

    :return: (*DataFrame*) Dropped DataFrame.

    :raises IndexError: If none of ``labels``, ``index`` and ``columns`` is given.
    """
    if index is None and columns is None:
        if labels is None:
            raise IndexError('At least one argument needed: labels, index, columns!')
        # Honour the documented string form of the axis as well as the number
        if axis == 0 or axis == 'index':
            index = labels
        else:
            columns = labels

    # The Java call expects sequences; wrap single labels
    if index is not None and not isinstance(index, (list, tuple)):
        index = [index]
    if columns is not None and not isinstance(columns, (list, tuple)):
        columns = [columns]

    r = self._dataframe.drop(index, columns)
    return DataFrame(dataframe=r)
|
||
|
||
def dropna(self, axis=0, how='any', thresh=None, subset=None, inplace=False):
    """
    Remove missing values.

    ``thresh``, ``subset`` and ``inplace`` are accepted but not used by this
    implementation; a new DataFrame is always returned.

    :param axis: (*int or str*) ``0`` or ``'index'`` works on rows; any other value works
        on columns. Default is ``0``.
    :param how: (*str*) ``'any'`` uses the Java ``dropNAAny`` call; any other value uses
        ``dropNAAll``. Default is ``'any'``.
    :param thresh: (*int*) Not used.
    :param subset: (*list*) Not used.
    :param inplace: (*bool*) Not used.

    :return: (*DataFrame*) DataFrame with NA entries dropped from it.
    """
    by_row = (axis == 0 or axis == 'index')
    if how == 'any':
        cleaned = self._dataframe.dropNAAny(by_row)
    else:
        cleaned = self._dataframe.dropNAAll(by_row)
    return DataFrame(dataframe=cleaned)
|
||
|
||
def replace(self, to_replace, value):
    """
    Replace values given in ``to_replace`` with ``value``.

    :param to_replace: (*object*) The value to be replaced.
    :param value: (*object*) The replacing value.

    :return: (*DataFrame*) Wrapped result of the Java ``replace`` call.
    """
    replaced = self._dataframe.replace(to_replace, value)
    return DataFrame(dataframe=replaced)
|
||
|
||
def append(self, other):
    """
    Append another data frame or row data.

    :param other: (*DataFrame, dict, list*) Other data frame or row data.

    :returns: (*DataFrame*) For a DataFrame argument, the wrapped result of the Java
        ``append`` call. For anything else, ``other`` is passed to the Java ``append``
        call, its result is discarded, and this object is returned.
    """
    if not isinstance(other, DataFrame):
        self._dataframe.append(other)
        return self
    merged = self._dataframe.append(other._dataframe)
    return DataFrame(dataframe=merged)
|
||
|
||
def describe(self):
    """
    Generate descriptive statistics of the data frame.

    :returns: (*DataFrame*) Wrapped result of the Java ``describe`` call.
    """
    summary = self._dataframe.describe()
    return DataFrame(dataframe=summary)
|
||
|
||
def sort_index(self, axis=0, ascending=True):
    """
    Sort by the index.

    :param axis: (*int*) Accepted but not used by this implementation.
    :param ascending: (*boolean*) Sort ascending vs. descending.

    :returns: (*DataFrame*) Wrapped result of the Java ``sortByIndex`` call.
    """
    ordered = self._dataframe.sortByIndex(ascending)
    return DataFrame(dataframe=ordered)
|
||
|
||
def sort_values(self, by, axis=0, ascending=True):
    """
    Sort by the values of one or more columns.

    :param by: (*string or list of string*) Name or list of names to sort by.
    :param axis: (*int*) Accepted but not used by this implementation.
    :param ascending: (*boolean or list of boolean*) Sort ascending vs. descending. A
        single bool is repeated for every name in ``by``; a list is passed through as is.

    :returns: (*DataFrame*) Wrapped result of the Java ``sortBy`` call.
    """
    names = [by] if isinstance(by, basestring) else by
    orders = [ascending] * len(names) if isinstance(ascending, bool) else ascending
    ordered = self._dataframe.sortBy(names, orders)
    return DataFrame(dataframe=ordered)
|
||
|
||
def reindex(self, index=None, columns=None, axis=None):
    """
    Conform the data frame to new row and/or column labels.

    :param index: (*array-like or slice*) New labels for the index, or a slice of
        existing index labels. ``None`` keeps all rows.
    :param columns: (*list, string or slice*) Column name(s), or a slice of existing
        column names. ``None`` keeps all columns.
    :param axis: (*int or str*) Accepted but not used by this implementation.

    :return: Series or DataFrame with the requested labels. ``None`` for an unsupported
        ``columns`` type or when the Java side yields no result.

    :raises KeyError: If a slice bound or a single column name is not found, or the
        index ``get_indexer`` returns no positions.
    """
    if index is None:
        index = slice(None)

    # ---- rows
    k = index
    if isinstance(k, slice):
        sidx = 0 if k.start is None else self._index.index(k.start)
        if sidx < 0:
            raise KeyError(index)
        eidx = self.shape[0] - 1 if k.stop is None else self._index.index(k.stop)
        if eidx < 0:
            raise KeyError(index)
        step = 1 if k.step is None else k.step
        rowkey = Range(sidx, eidx, step)
    else:
        rowkey = self._index.get_indexer(k)
        if len(rowkey) == 0:
            raise KeyError(index)

    # ---- columns
    k = columns
    if k is None:
        colkey = range(0, self.shape[1], 1)
    elif isinstance(k, slice):
        sidx = 0 if k.start is None else self.columns.indexOfName(k.start)
        if sidx < 0:
            raise KeyError(columns)
        eidx = self.shape[1] - 1 if k.stop is None else self.columns.indexOfName(k.stop)
        if eidx < 0:
            raise KeyError(columns)
        step = 1 if k.step is None else k.step
        colkey = Range(sidx, eidx, step)
    elif isinstance(k, list):
        colkey = self.columns.indexOfName(k)
    elif isinstance(k, basestring):
        col = self.columns.indexOfName(k)
        if col < 0:
            raise KeyError(columns)
        colkey = [col]
    else:
        return None

    if isinstance(rowkey, (int, Range)):
        # Slice of existing rows: a plain selection is enough
        r = self._dataframe.select(rowkey, colkey)
    else:
        # ``index`` is never None or a slice on this path
        rkeys = index if isinstance(index, list) else [index]
        if columns is None:
            columns = self.columns.names
        r = self._dataframe.reIndex(rkeys, rowkey, colkey, columns)

    if r is None:
        return None
    if isinstance(r, MISeries):
        return series.Series(series=r)
    return DataFrame(dataframe=r)
|
||
|
||
def groupby(self, by):
    """
    Group the data frame.

    :param by: (*string or list*) Grouping key(s); a single string is wrapped in a list
        before it is passed to the Java ``groupBy`` call.

    :returns: GroupBy object.
    """
    keys = [by] if isinstance(by, basestring) else by
    grouped = self._dataframe.groupBy(keys)
    return groupby.GroupBy(grouped)
|
||
|
||
def resample(self, by):
    """
    Group the data frame by its index (documented for date time indexes).

    :param by: Used to determine the groups; passed to the Java ``groupByIndex`` call.

    :returns: GroupBy object.
    """
    grouped = self._dataframe.groupByIndex(by)
    return groupby.GroupBy(grouped)
|
||
|
||
def count(self):
    """
    Count the values.

    :returns: (*DataFrame*) Wrapped result of the Java ``count`` call.
    """
    stat = self._dataframe.count()
    return DataFrame(dataframe=stat)
|
||
|
||
def sum(self):
    """
    Sum the values.

    :returns: (*DataFrame*) Wrapped result of the Java ``sum`` call.
    """
    stat = self._dataframe.sum()
    return DataFrame(dataframe=stat)
|
||
|
||
def mean(self):
    """
    Average the values.

    :returns: (*DataFrame*) Wrapped result of the Java ``mean`` call.
    """
    stat = self._dataframe.mean()
    return DataFrame(dataframe=stat)
|
||
|
||
def min(self):
    """
    Find the minimum of the values.

    :returns: (*DataFrame*) Wrapped result of the Java ``min`` call.
    """
    stat = self._dataframe.min()
    return DataFrame(dataframe=stat)
|
||
|
||
def max(self):
    """
    Find the maximum of the values.

    :returns: (*DataFrame*) Wrapped result of the Java ``max`` call.
    """
    stat = self._dataframe.max()
    return DataFrame(dataframe=stat)
|
||
|
||
def median(self):
    """
    Find the median of the values.

    :returns: (*DataFrame*) Wrapped result of the Java ``median`` call.
    """
    stat = self._dataframe.median()
    return DataFrame(dataframe=stat)
|
||
|
||
def std(self):
    """
    Compute the standard deviation of the values.

    :returns: (*DataFrame*) Wrapped result of the Java ``stdDev`` call.
    """
    stat = self._dataframe.stdDev()
    return DataFrame(dataframe=stat)
|
||
|
||
def isin(self, values):
    """
    Test whether the elements of the data frame are contained in ``values``.

    :param values: (*list*) The sequence of values to test.

    :return: (*DataFrame*) Wrapped result of the Java ``isIn`` call; documented as a
        DataFrame of booleans indicating if each element is in ``values``.
    """
    mask = self._dataframe.isIn(values)
    return DataFrame(dataframe=mask)
|
||
|
||
@classmethod
def read_table(cls, filepath, **kwargs):
    """
    Create a DataFrame by reading column oriented data from a file.

    Keyword arguments other than those listed below are ignored.

    :param filepath: (*string*) File path for reading.
    :param delimiter: (*string*) Variable delimiter character. Default is ``None``; the
        original documentation describes this as space or tab delimited.
    :param format: (*string*) Column format of the file. Default is ``None``, documented
        as reading all columns as string. ``%s``: string; ``%i``: integer; ``%f``: float;
        ``%{yyyyMMdd...}D``: date time.
    :param skiprows: (*int*) Lines to skip at beginning of the file. Default is ``0``.
    :param skipfooter: (*int*) Number of lines at bottom of file to skip. Default is ``0``.
    :param encoding: (*string*) Character encoding of the file. ``None`` is passed when
        not given (the original documentation names ``UTF8`` as the default - presumably
        applied on the Java side; verify).
    :param names: (*array_like*) List of column names to use. Default is ``None``.
    :param header: (*int*) Row number to use as the column names. Default is ``0``.
    :param index_col: (*int*) Column to use as the row labels. Default is ``-1``.
    :param index_format: (*string*) Index column format. Default is ``None``.
    :param usecols: (*list*) Subset of columns to return, as positions or names. When
        omitted, the Java reader is called without this argument.

    :returns: (*DataFrame*) The DataFrame.
    """
    take = kwargs.pop
    delimiter = take('delimiter', None)
    col_format = take('format', None)
    skiprows = take('skiprows', 0)
    skipfooter = take('skipfooter', 0)
    encoding = take('encoding', None)
    names = take('names', None)
    header = take('header', 0)
    index_col = take('index_col', -1)
    index_format = take('index_format', None)
    usecols = take('usecols', None)

    if usecols is not None:
        midf = MIDataFrame.readTable(filepath, delimiter, skiprows, col_format, encoding,
                                     index_col, index_format, names, header, skipfooter,
                                     usecols)
    else:
        midf = MIDataFrame.readTable(filepath, delimiter, skiprows, col_format, encoding,
                                     index_col, index_format, names, header, skipfooter)
    return DataFrame(dataframe=midf)
|
||
|
||
def to_csv(self, filepath, delimiter=',', format=None, date_format=None,
           float_format=None, index=True):
    """
    Save the data to a csv file through the Java ``saveCSV`` call.

    :param filepath: (*string*) The output file path.
    :param delimiter: (*string*) Field delimiter character. Default is ``,``.
    :param format: (*string*) Format string.
    :param date_format: (*string*) Date format string, e.g. 'yyyyMMddHH'.
    :param float_format: (*string*) Float format string, e.g. '%.2f'.
    :param index: (*boolean*) Write index or not. Default is ``True``.
    """
    save_args = (filepath, delimiter, format, date_format, float_format, index)
    self._dataframe.saveCSV(*save_args)
|
||
|
||
################################################################# |