## ----include = FALSE----------------------------------------------------------
# Chunk defaults for the rendered vignette: collapse source and output into
# one block, prefix output lines with "#>", use 7 x 5 figures, and hide
# warnings and messages.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5,
  warning = FALSE,
  message = FALSE
)

## -----------------------------------------------------------------------------
library(quickOutlier)
library(ggplot2)

## -----------------------------------------------------------------------------
# Create data with an obvious outlier
# (the seed is set once here; later rnorm() calls continue the same stream)
set.seed(123)
df <- data.frame(val = c(rnorm(50), 100))

# Detect using Z-Score (Standard Deviation)
outliers <- detect_outliers(df, "val", method = "zscore", threshold = 3)
print(head(outliers))

## -----------------------------------------------------------------------------
plot_outliers(df, "val", method = "zscore")

## -----------------------------------------------------------------------------
# Scan the entire dataframe
scan_data(mtcars, method = "iqr")

## -----------------------------------------------------------------------------
# Create correlated data and add an outlier
df_multi <- data.frame(x = 1:20, y = 1:20)
df_multi <- rbind(df_multi, data.frame(x = 5, y = 20)) # Anomalous point

res_multi <- detect_multivariate(df_multi, c("x", "y"))
tail(res_multi, 3)

## -----------------------------------------------------------------------------
# Lower confidence level to make it more sensitive for the demo
# NOTE(review): 0.99 is only "lower" if the function's default is higher --
# confirm against the plot_interactive() documentation.
plot_interactive(df_multi, "x", "y", confidence_level = 0.99)

## -----------------------------------------------------------------------------
# Use the same multi-dimensional data
# k = number of neighbors to consider
res_lof <- detect_density(df_multi, k = 5, threshold = 1.5)
res_lof

## -----------------------------------------------------------------------------
# Generate a 2D blob of data
data_ml <- data.frame(
  feat1 = rnorm(100),
  feat2 = rnorm(100)
)

# Add an extreme outlier (overwrites the first row)
data_ml[1, ] <- c(10, 10)

# Run Isolation Forest
# ntrees = 100 is standard. contamination = 0.05 means we expect ~5% outliers.
res_if <- detect_iforest(data_ml, ntrees = 100, contamination = 0.05)

# View the outlier score (0 to 1)
head(subset(res_if, Is_Outlier == TRUE))

## -----------------------------------------------------------------------------
# Create a synthetic time series: Sine wave + Noise + Outlier
# The time index is named `time_idx` rather than `t` so that it does not
# shadow base::t().
time_idx <- seq(1, 10, length.out = 60)
y <- sin(time_idx) + rnorm(60, sd = 0.1)
y[30] <- 5 # Spike (Outlier)

# Detect using STL Decomposition
res_ts <- detect_ts_outliers(y, frequency = 12)

# Check the detected outlier
subset(res_ts, Is_Outlier == TRUE)

## -----------------------------------------------------------------------------
# Ten "Madrid" entries plus three one-off spellings; the misspellings are
# presumably deliberate so that they show up as rare categories.
cities <- c(rep("Madrid", 10), "Barcalona", "Barcelona", "MAdrid")
detect_categorical_outliers(cities, min_freq = 0.1)

## -----------------------------------------------------------------------------
# Use mtcars and create a high leverage point
cars_df <- mtcars
cars_df[1, "wt"] <- 10
cars_df[1, "mpg"] <- 50

infl <- diagnose_influence(cars_df, "mpg", "wt")
head(subset(infl, Is_Influential == TRUE))

## -----------------------------------------------------------------------------
# Create data with an extreme value
df_treat <- data.frame(val = c(1, 2, 3, 2, 1, 100))

# Cap values at 1.5 * IQR
df_clean <- treat_outliers(df_treat, "val", method = "iqr", threshold = 1.5)
print(df_clean$val)