# Copyright 2022 Cognite AS
"""
===========================================
Detect out of range outliers in sensor data
===========================================
Introduction
------------
The :func:`out_of_range` function uses Savitzky-Golay smoothing (SG) - :func:`sg` to estimate the main trend of the data:
The algorithm carries out two iterations to determine the trend and detect extreme outliers.
The outlier detection is based on the |studentized residuals| and the |bonferroni correction|. The residuals between
the original data and the estimated trend are studentized and the Bonferroni Correction is used to identify potential
outliers.
Note that the `Out of Range` algorithm is designed to be used with *non-linear, non-stationary* sensor data. Therefore,
lets start by generating a synthetic signal with those characteristics. We will use some of the signal generator
functions in InDSL. We also need a few additional InDSL helper functions and algorithms to demonstrate how the outlier
detection works step by step.
.. |studentized residuals| raw:: html
studentized residuals
.. |bonferroni correction| raw:: html
Bonferroni Correction
.. |Student's-t distribution| raw:: html
Student's-t distribution
.. |Numpy's| raw :: html
Numpy's
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import t as student_dist
from datasets.data.synthetic_industrial_data import non_linear_non_stationary_signal
from indsl.data_quality.outliers import (
_calculate_hat_diagonal,
_calculate_residuals_and_normalize_them,
_split_timeseries_into_time_and_value_arrays,
out_of_range,
)
from indsl.resample import reindex
from indsl.smooth import sg
##############################################################################
# Non-linear, non-stationary synthetic signal
# -------------------------------------------
# We'll use a pre-defined test data set with a non-linear, non-stationary time series with "`industrial`"
# characteristics. This data set is a time series composed of 3 oscillatory components, 2 nonlinear trends, sensor
# linear drift (small decrease over time) and white noise. The signal has non-uniform
# time stamps, a fraction (35%) of the data is randomly removed to generate data gaps. The data gaps and white
# noise are inserted with a constant seed to have a reproducible behavior of the algorithm. The functions used
# to generate this signal are also part of InDSL: :func:`insert_data_gaps`, :func:`line`, :func:`perturb_timestamp`,
# :func:`sine_wave`, and :func:`white_noise`.
seed_array = [10, 1975, 2000, 6000, 1, 89756]
seed = seed_array[4]
data = non_linear_non_stationary_signal(seed=seed)
# A simple function to style and annotate the figures.
def style_and_annotate_figure(
ax,
text,
x_pos=0.50,
y_pos=0.9,
fsize=12,
fsize_annotation=18,
title_fsize=14,
ylimits=[-3005, 8000],
title_txt=None,
):
ax.text(
x_pos, y_pos, text, transform=ax.transAxes, ha="center", va="center", fontsize=fsize_annotation, color="dimgrey"
)
ax.legend(fontsize=fsize, loc=4)
ax.tick_params(axis="both", which="major", labelsize=fsize)
ax.tick_params(axis="both", which="minor", labelsize=fsize)
ax.tick_params(axis="x", rotation=45)
ax.set_title(title_txt, fontsize=title_fsize)
ax.grid()
ax.set_ylim(ylimits)
##############################################################################
# Insert extreme outliers
# -----------------------
# This algorithm was tested with outliers generated at different locations. Five percent (5%) of the data points at
# random locations were replaced by outliers. To do so we used |Numpy's| random generator with different seeds.
# Feel free to use one of the 5 seeds below. The algorithm will work with 100% precision for these conditions and
# parameters used in the example. Or use another seed to generate a different signal and further test the
# limits of the algorithm.
data = data.dropna()
rng = np.random.default_rng(seed)
outlier_fraction = 0.05 # Fraction of the signal that will be replaced by outliers
num_outliers = round(len(data) * outlier_fraction)
locations = np.unique(rng.integers(low=0, high=len(data), size=num_outliers))
direction = rng.choice([1, -1], size=len(locations))
outliers = data.iloc[locations] + data.mean() * rng.uniform(0.5, 1, len(locations)) * direction
data_w_outliers = data.copy()
data_w_outliers[locations] = outliers
##############################################################################
# Initial conditions: test data set
# *********************************
# The figure below shows the original data set and the outliers inserted. We took care to give the outliers random
# values, both far and close to the main trend. But far enough for these to be categorized as an extreme deviation from
# the expected behavior of the data.
fig_size = (9, 7)
fig, ax = plt.subplots(figsize=fig_size)
ax.plot(data, linewidth=2, label="Original data set")
ax.plot(outliers, "ro", markersize=3, label="Outliers inserted")
outliers_inserted = (
f"{len(outliers)} outliers inserted ({round(100 * len(outliers) / len(data), 1)}% of the data points)"
)
style_and_annotate_figure(ax, text=outliers_inserted, title_txt="Test Data Set")
##############################################################################
# Initial iteration
# -----------------
#
# 1. Trend estimate
# *****************
# As mentioned before, we will use a SG smoother to estimate the trend. To demonstrate how the initial pass works, we'll run the SG independently. The SG smoother
# requires a point-wise window length and a polynomial order. The bigger the window, more data
# used to estimate the local trend. With the polynomial order we influence how much we want the fit to follow the
# non-linear characteristics of the data (1, linear, >1 increasingly non-linear fit). In this case we will use a window
# of 20 data points and 3rd order polynomial fit.
# Estimate the trend using Savitzky-Golay smoother
tolerance = 0.08
trend_pass01 = sg(data_w_outliers, window_length=20, polyorder=3)
##############################################################################
# 2. Studentized residuals and Bonferroni correction
# **************************************************
# Identifying potential outliers is done by comparing how much each data point deviates from the estimated
# main trend (i.e. the residuals). However, since in most cases little information about the data is readily
# available and extreme outliers are expected to be sparse and uncommon, the |Student's-t distribution| is well
# suited for the current task, where the sample size is small and the standard deviation is
# unknown. To demonstrate how the residuals are studentized, we use a helper function from InDSL.
# But these steps are integrated into the :func:`out_of_range` function.
# Finally, since we aim to identify extreme outliers, a simple t-test does not suffice. Hence the Bonferroni Correction.
# Furthermore, we use a relaxation factor for the Bonferroni factor estimated from the data to adjust the sensitivity
# of the correction. Again, the Bonferroni Correction is explicitly calculated here but it is integrated into the
# :func:`out_of_range` function.
# Statistical parameters
alpha = 0.05 # Significance level or probability of rejecting the null hypothesis when true.
bc_relaxation = 1 / 4 # Bonferroni relaxation coefficient.
x, y = _split_timeseries_into_time_and_value_arrays(data_w_outliers)
y_pred_pass01 = trend_pass01.to_numpy()
hat_diagonal = _calculate_hat_diagonal(x)
# Calculate degrees of freedom (n-p-1)
n = len(y)
dof = n - 3 # Using p = 2 for a model based on a single time series
# Calculate Bonferroni critical value and studentized residuals
bc = student_dist.ppf(1 - alpha / (2 * n), df=dof) * bc_relaxation
t_res = _calculate_residuals_and_normalize_them(dof, hat_diagonal, y, y_pred_pass01)
# Boolean mask where outliers are detected
mask = np.logical_and(t_res < bc, t_res > -bc)
filtered_ts_pass01 = pd.Series(y[mask], index=data.index[mask]) # Remove detected outliers from time series
outliers_pass01 = pd.Series(y[~mask], index=data.index[~mask])
##############################################################################
# 3. Outliers detected with the initial pass
# ******************************************
# The figure below shows the results of the initial pass. The SG method does a good job at estimating the trend, except
# for a few periods in the data where a larger number of outliers are grouped together. This causes strong nonlinear
# behavior in the estimated trend, and as a consequence some data points are miss-identified as outliers. But overall,
# a good enough baseline.
fig2, ax2 = plt.subplots(figsize=fig_size)
ax2.plot(data_w_outliers, "--", color="orange", label="Data with outliers")
ax2.plot(trend_pass01, "k", linewidth=2, label="Savitzky-Golay trend")
ax2.plot(outliers_pass01, "wo", markersize=7, alpha=0.85, mew=2, markeredgecolor="green", label="Outliers detected")
ax2.plot(outliers, "ro", markersize=3, label="Outliers inserted")
text_outlier_res = (
f"{len(outliers_pass01)} out of {len(outliers)} outliers detected "
f"({round(100 * len(outliers_pass01) / len(outliers), 1)}%)"
)
style_and_annotate_figure(ax2, text=text_outlier_res, title_txt="First Iteration: Savitzky-Golay trend")
##############################################################################
# Last iteration
# --------------
# For the last iteration the outliers previously detected are removed and then use the SG method to estimate the main trend.
tolerance_pass02 = 0.01
trend_pass02 = sg(filtered_ts_pass01, window_length=20, polyorder=3)
# Filtering parameters
alpha_pass02 = 0.05
bc_relaxation_pass02 = 1 / 2
bc_pass02 = student_dist.ppf(1 - alpha_pass02 / (2 * n), df=dof) * bc_relaxation_pass02
y_pred_pass02 = reindex(trend_pass02, data_w_outliers)
y_pred_pass02 = y_pred_pass02.to_numpy()
t_res_pass02 = _calculate_residuals_and_normalize_them(dof, hat_diagonal, y, y_pred_pass02)
# Boolean mask where outliers are detected
mask_pass02 = np.logical_and(t_res_pass02 < bc_pass02, t_res_pass02 > -bc_pass02)
filtered_ts_pass02 = pd.Series(y[mask_pass02], index=data.index[mask_pass02])
# Remove detected outliers from time series
outliers_pass02 = pd.Series(y[~mask_pass02], index=data.index[~mask_pass02])
# Run the InDSL function that carries out the entire analysis with the same parameters
indsl_outliers = out_of_range(data_w_outliers)
##############################################################################
# Results
# -------
# The figure below shows the original data, the trend estimated using the SG method, the outliers artificially
# inserted, and the outliers detected by the full method (:func:`out_of_range`). A perfect performance is observed, all
# outliers are detected. This "perfect" performance will not always be the case but this function provides a very
# robust option for detecting and removing out of range or extreme outliers in *non-linear,
# non-stationary sensor data*.
# sphinx_gallery_thumbnail_number = 3
fig3, ax3 = plt.subplots(figsize=fig_size)
ax3.plot(data_w_outliers, "--", color="orange", label="Data with outliers")
ax3.plot(trend_pass02, "k", linewidth=2, label="Savitzky-Golay trend")
ax3.plot(
indsl_outliers,
"wo",
markersize=7,
alpha=0.85,
mew=2,
markeredgecolor="green",
label="Outliers detected",
)
ax3.plot(outliers, "ro", markersize=3, label="Outliers inserted")
text_outlier_res = (
f"{len(indsl_outliers)} out of {len(outliers)} outliers detected "
f"({round(100 * len(indsl_outliers) / len(outliers), 1)}%)"
)
style_and_annotate_figure(ax3, text=text_outlier_res, title_txt="Final Iteration: EMD-HHT trend")
plt.show()