# Source code for dSalmon.scalers
# Copyright (c) 2020 CN Group, TU Wien
# Released under the GNU Lesser General Public License version 3,
# see accompanying file LICENSE or <https://www.gnu.org/licenses/>.
"""
Scalers for streaming data.
"""
import numpy as np
from dSalmon import swig as dSalmon_cpp
from dSalmon.util import sanitizeData, sanitizeTimes
class SWScaler(object):
    """
    Base class for sliding window scalers.

    Subclasses implement `_transform`, which receives a 2-D data array
    together with one timestamp per row. Its return value is ignored, so
    it is expected to overwrite the data array with the scaled values.

    Parameters
    ----------
    float_type: np.float32 or np.float64
        The floating point type to use for internal processing.
    """

    def __init__(self, float_type=np.float64):
        self.float_type = float_type
        # Timestamp of the last sample processed so far; handed to
        # sanitizeTimes when the next chunk is processed.
        self.last_time = 0
        # Number of features per sample; -1 until the first chunk is seen.
        self.dimension = -1

    def _transform(self, X, times):
        """
        Scale `X` in place. Must be overridden by subclasses.
        """
        raise NotImplementedError()

    def _check_dimension(self, n_features):
        # Remember the feature count of the first chunk and reject any later
        # chunk that disagrees with it. AssertionError is raised explicitly
        # (instead of using `assert`) so the check is not stripped by
        # `python -O` while the exception type seen by callers is unchanged.
        if self.dimension != -1 and n_features != self.dimension:
            raise AssertionError(
                'X has %d features, but previous data had %d'
                % (n_features, self.dimension)
            )
        self.dimension = n_features

    def transform(self, X, times=None):
        """
        Transform the next chunk of data.

        The input `X` is left unmodified; the scaled values are written
        to a separate array.

        Parameters
        ----------
        X: ndarray, shape (n_samples, n_features)
            The input data.

        times: ndarray, shape (n_samples,), optional
            Timestamps for input data. If None,
            timestamps are linearly increased for
            each sample.

        Returns
        -------
        X_tr: ndarray, shape (n_samples, n_features)
            Transformed input data.

        Raises
        ------
        AssertionError
            If the number of features differs from previous chunks.
        """
        X_tr = sanitizeData(X, self.float_type)
        # _transform() overwrites its argument, so it must never be handed
        # memory owned by the caller. An identity check alone would miss the
        # case where sanitizeData returns a view onto X (e.g. a reshaped
        # array), so overlapping buffers are detected as well.
        # NOTE(review): sanitizeData lives in dSalmon.util and is not visible
        # here - whether it can actually return a view should be confirmed.
        aliases_input = X_tr is X or (
            isinstance(X, np.ndarray) and np.may_share_memory(X_tr, X)
        )
        if aliases_input:
            X_tr = X_tr.copy()
        self._check_dimension(X_tr.shape[1])
        times = sanitizeTimes(times, X_tr.shape[0], self.last_time, self.float_type)
        self.last_time = times[-1]
        self._transform(X_tr, times)
        return X_tr

    def transform_inplace(self, X, times=None):
        """
        Transform the next chunk of data in-place. Requires
        `X` to be a C-style contiguous, writeable `ndarray`.

        Arrays with fewer than two dimensions are reshaped to a single
        row, i.e. they are treated as one sample.

        Parameters
        ----------
        X: ndarray, shape (n_samples, n_features)
            The input data.

        times: ndarray, shape (n_samples,), optional
            Timestamps for input data. If None,
            timestamps are linearly increased for
            each sample.

        Returns
        -------
        X_tr: ndarray, shape (n_samples, n_features)
            Transformed input data. Equal to `X` (reshaped to two
            dimensions if `X` had fewer).

        Raises
        ------
        AssertionError
            If `X` is not an ndarray with at most two dimensions, is not
            C-contiguous and writeable, or has a different number of
            features than previous chunks.
        """
        # Explicit raises keep these checks active under `python -O`;
        # AssertionError is kept for backward compatibility.
        if not (isinstance(X, np.ndarray) and X.ndim in (0, 1, 2)):
            raise AssertionError('X must be an ndarray with at most 2 dimensions')
        if not (X.flags['C_CONTIGUOUS'] and X.flags['WRITEABLE']):
            raise AssertionError('X must be C-contiguous and writeable')
        if X.ndim < 2:
            X = X.reshape((1, -1))
        self._check_dimension(X.shape[1])
        times = sanitizeTimes(times, X.shape[0], self.last_time, self.float_type)
        self.last_time = times[-1]
        self._transform(X, times)
        return X
class SWZScoreScaler(SWScaler):
    """
    Sliding window z-score scaler.

    Samples are normalized using the mean and standard deviation
    observed in a sliding window of length `window`.

    Parameters
    ----------
    window: float
        Window length after which samples will be pruned.

    float_type: np.float32 or np.float64
        The floating point type to use for internal processing.
    """

    def __init__(self, window, float_type=np.float64):
        super().__init__(float_type)
        self.window = window
        # One wrapped statistics tree class exists per supported precision;
        # any other `float_type` fails the lookup with a KeyError.
        tree_types = {
            np.float32: dSalmon_cpp.StatisticsTree32,
            np.float64: dSalmon_cpp.StatisticsTree64,
        }
        self.tree = tree_types[float_type](window)

    def _transform(self, X, times):
        # The base class returns the array passed in here, so the wrapped
        # call is presumably writing the z-scores into `X` itself.
        self.tree.transform_zscore(X, times)
class SWQuantileScaler(SWScaler):
    """
    Sliding window quantile scaler.

    Data is normalized so that the p-quantile of the current sliding
    window maps to 0 and the (1-p)-quantile maps to 1. With
    `quantile==0` this amounts to minmax normalization, which is not
    robust and is therefore likely to give unstable results for stream
    data.

    Parameters
    ----------
    window: float
        Window length after which samples will be pruned.

    quantile: float with 0 <= quantile < 0.5
        The quantile value for computing reference values.

    float_type: np.float32 or np.float64
        The floating point type to use for internal processing.
    """

    def __init__(self, window, quantile, float_type=np.float64):
        super().__init__(float_type)
        self.window = window
        self.quantile = quantile
        # One wrapped statistics tree class exists per supported precision;
        # any other `float_type` fails the lookup with a KeyError.
        tree_types = {
            np.float32: dSalmon_cpp.StatisticsTree32,
            np.float64: dSalmon_cpp.StatisticsTree64,
        }
        self.tree = tree_types[float_type](window)

    def _transform(self, X, times):
        # The base class returns the array passed in here, so the wrapped
        # call is presumably writing the scaled values into `X` itself.
        self.tree.transform_quantile(X, times, self.quantile)