NOTES ON STATISTICS, PROBABILITY and MATHEMATICS


Mahalanobis Distance:


From Wikipedia:

The Mahalanobis distance is a measure of the distance between a point \(P\) and a distribution \(D,\) introduced by P. C. Mahalanobis in 1936. Mahalanobis’s definition was prompted by the problem of identifying the similarities of skulls based on measurements in 1927.[2]

It is a multi-dimensional generalization of the idea of measuring how many standard deviations away \(P\) is from the mean of \(D.\) This distance is zero for \(P\) at the mean of \(D\) and grows as \(P\) moves away from the mean along each principal component axis. If each of these axes is re-scaled to have unit variance, then the Mahalanobis distance corresponds to standard Euclidean distance in the transformed space. The Mahalanobis distance is thus unitless, scale-invariant, and takes into account the correlations of the data set.

Given a probability distribution \(Q\) on \(\mathbb{R} ^{N},\) with mean \(\vec \mu = (\mu_1, \mu_2,\mu_3,…,\mu_N)^\top\) and positive-definite covariance matrix \(S,\) the Mahalanobis distance of a point \({\displaystyle {\vec {x}}=(x_{1},x_{2},x_{3},\dots ,x_{N})^{\mathsf {T}}}\) from \(Q\) is

\[{\displaystyle d_{M}({\vec {x}},Q)={\sqrt {({\vec {x}}-{\vec {\mu }})^{\mathsf {T}}S^{-1}({\vec {x}}-{\vec {\mu }})}}.}\] The inverse of the covariance matrix makes sense because we want to revert the effect of the covariance, and then calculate the Euclidean distances.


PLOTTING ELLIPSES:


# From library(chemometrics) just type drawMahal in the console
library(robustbase)
x <- mtcars[,c(1,6)]
head(x)
##                    mpg    wt
## Mazda RX4         21.0 2.620
## Mazda RX4 Wag     21.0 2.875
## Datsun 710        22.8 2.320
## Hornet 4 Drive    21.4 3.215
## Hornet Sportabout 18.7 3.440
## Valiant           18.1 3.460
cent <- covMcd(x)$center #  Minimum Covariance Determinant (MCD) estimator
s <- covMcd(x)$cov
s.svd <- svd(s)
r <- s.svd[["u"]] %*% diag(sqrt(s.svd[["d"]])) # s.svd[["u"]] eigenvectors of s x eigenvals
quantile <- c(1 - 0.5/2, 1 - 0.25/2, 1 - 0.05/2)  
# 0.975 is 1 - 0.05/2 = 95%
alphamd <- sqrt(qchisq(quantile, df = 2))
nquant <- length(quantile)
m <- 1000 # No. points in the ellipse

mat <- matrix(0, m + 1, 0)
for (j in 1:nquant) {
    e1md <- cos(c(0:m)/m * 2 * pi) * alphamd[j]
    e2md <- sin(c(0:m)/m * 2 * pi) * alphamd[j]
    emd <- cbind(e1md, e2md)
    ttmd <- t(r %*% t(emd)) + rep(1, m + 1) %o% cent
    mat <- cbind(mat, ttmd)
}
minx <- min(mat[,c(T,F)])
maxx <- max(mat[,c(T,F)])
miny <- min(mat[,c(F,T)])
maxy <- max(mat[,c(F,T)])
plot(x,
     xlim=c(min(minx,x[,1]), max(maxx,x[,1])), 
     ylim=c(min(miny,x[,2]), max(maxy, x[,2])), 
     xlab = "Miles per Gallon",
     ylab = "Weight of car (in metric tonnes)",
     col = "blue" , pch = 19, cex=0.5)    
text(x, labels = rownames(mtcars), 
     col=rgb(0,0,1, 0.3), cex = 0.9)

for(i in 1:nquant){
  mx <- mat[,c(T,F)][,i]
  my <- mat[,c(F,T)][,i]
  lines(mx, my, type = "l", col = rgb(1/nquant*i,0,0.5,0.2), lwd = 20)
}

CALCULATING DISTANCE:

library(assertr)
maha_dist(mtcars, robust=TRUE)
##  [1]   16.900411   20.459267    9.146248   29.297695    8.876131   24.790601
##  [7]  121.478738  175.039432   28.076750   16.643037   17.484597   22.965004
## [13]    9.686254   15.237039   17.990925   19.112670   20.021190   22.820337
## [19]  187.893154   20.012898  420.103326   14.197465   41.340860  118.295449
## [25]   11.657889   16.756740  164.366741  335.749683  869.157574  341.368373
## [31] 1450.779293   19.964501
maha_dist(mtcars, robust=FALSE)
##  [1]  8.946673  8.287933  8.937150  6.096726  5.429061  8.877558  9.136276
##  [8] 10.030345 22.593116 12.393107 11.058878  9.476126  5.594527  6.026462
## [15] 11.201310  8.672093 12.257618  9.078630 14.954377 10.296463 13.432391
## [22]  6.227235  5.786691 11.681526  6.718085  3.645789 18.356164 14.000669
## [29] 21.573003 11.152850 19.192384  9.888781
# From https://builtin.com/data-science/mahalanobis-distance

cars.center <- colMeans(mtcars)
cars.cov <- cov(mtcars)

# The squared distances are:
distances <- mahalanobis(x = mtcars , center = cars.center, 
                         cov = cars.cov)

scl <- scale(mtcars,center = T, scale = F)

m <- matrix(0, nrow(mtcars),1)
for(i in 1:nrow(mtcars))
m[i,] <- as.matrix(t(scl[i,])) %*% solve(cars.cov) %*% as.matrix(scl[i,])
t(m)
##          [,1]     [,2]    [,3]     [,4]     [,5]     [,6]     [,7]     [,8]
## [1,] 8.946673 8.287933 8.93715 6.096726 5.429061 8.877558 9.136276 10.03034
##          [,9]    [,10]    [,11]    [,12]    [,13]    [,14]    [,15]    [,16]
## [1,] 22.59312 12.39311 11.05888 9.476126 5.594527 6.026462 11.20131 8.672093
##         [,17]   [,18]    [,19]    [,20]    [,21]    [,22]    [,23]    [,24]
## [1,] 12.25762 9.07863 14.95438 10.29646 13.43239 6.227235 5.786691 11.68153
##         [,25]    [,26]    [,27]    [,28]  [,29]    [,30]    [,31]    [,32]
## [1,] 6.718085 3.645789 18.35616 14.00067 21.573 11.15285 19.19238 9.888781
# Compare to
distances
##           Mazda RX4       Mazda RX4 Wag          Datsun 710      Hornet 4 Drive 
##            8.946673            8.287933            8.937150            6.096726 
##   Hornet Sportabout             Valiant          Duster 360           Merc 240D 
##            5.429061            8.877558            9.136276           10.030345 
##            Merc 230            Merc 280           Merc 280C          Merc 450SE 
##           22.593116           12.393107           11.058878            9.476126 
##          Merc 450SL         Merc 450SLC  Cadillac Fleetwood Lincoln Continental 
##            5.594527            6.026462           11.201310            8.672093 
##   Chrysler Imperial            Fiat 128         Honda Civic      Toyota Corolla 
##           12.257618            9.078630           14.954377           10.296463 
##       Toyota Corona    Dodge Challenger         AMC Javelin          Camaro Z28 
##           13.432391            6.227235            5.786691           11.681526 
##    Pontiac Firebird           Fiat X1-9       Porsche 914-2        Lotus Europa 
##            6.718085            3.645789           18.356164           14.000669 
##      Ford Pantera L        Ferrari Dino       Maserati Bora          Volvo 142E 
##           21.573003           11.152850           19.192384            9.888781

TESTING FOR OUTLIERS:

# Cutoff value for distances from Chi-Square Dist. 
# with p = 0.95 df = 2 which in ncol(air)
cutoff <- qchisq(p = 0.95 , df = ncol(mtcars))

## Display observation whose distance greater than cutoff value
mtcars[distances > cutoff ,]
##                 mpg cyl  disp  hp drat   wt qsec vs am gear carb
## Merc 230       22.8   4 140.8  95 3.92 3.15 22.9  1  0    4    2
## Ford Pantera L 15.8   8 351.0 264 4.22 3.17 14.5  0  1    5    4

Home Page

NOTE: These are tentative notes on different topics for personal use - expect mistakes and misunderstandings.