Processing math: 100%

NOTES ON STATISTICS, PROBABILITY and MATHEMATICS


Mahalanobis Distance:


From Wikipedia:

The Mahalanobis distance is a measure of the distance between a point P and a distribution D, introduced by P. C. Mahalanobis in 1936. Mahalanobis’s definition was prompted by the problem of identifying the similarities of skulls based on measurements in 1927.[2]

It is a multi-dimensional generalization of the idea of measuring how many standard deviations away P is from the mean of D. This distance is zero for P at the mean of D and grows as P moves away from the mean along each principal component axis. If each of these axes is re-scaled to have unit variance, then the Mahalanobis distance corresponds to standard Euclidean distance in the transformed space. The Mahalanobis distance is thus unitless, scale-invariant, and takes into account the correlations of the data set.

Given a probability distribution Q on RN, with mean →μ=(μ1,μ2,μ3,…,μN)⊤ and positive-definite covariance matrix S, the Mahalanobis distance of a point →x=(x1,x2,x3,…,xN)T from Q is

dM(→x,Q)=√(→x−→μ)TS−1(→x−→μ). The inverse of the covariance matrix makes sense because we want to revert the effect of the covariance, and then calculate the Euclidean distances.


PLOTTING ELLIPSES:


# From library(chemometrics) just type drawMahal in the console
library(robustbase)
x <- mtcars[,c(1,6)]
head(x)
##                    mpg    wt
## Mazda RX4         21.0 2.620
## Mazda RX4 Wag     21.0 2.875
## Datsun 710        22.8 2.320
## Hornet 4 Drive    21.4 3.215
## Hornet Sportabout 18.7 3.440
## Valiant           18.1 3.460
cent <- covMcd(x)$center #  Minimum Covariance Determinant (MCD) estimator
s <- covMcd(x)$cov
s.svd <- svd(s)
r <- s.svd[["u"]] %*% diag(sqrt(s.svd[["d"]])) # s.svd[["u"]] eigenvectors of s x eigenvals
quantile <- c(1 - 0.5/2, 1 - 0.25/2, 1 - 0.05/2)  
# 0.975 is 1 - 0.05/2 = 95%
alphamd <- sqrt(qchisq(quantile, df = 2))
nquant <- length(quantile)
m <- 1000 # No. points in the ellipse

mat <- matrix(0, m + 1, 0)
for (j in 1:nquant) {
    e1md <- cos(c(0:m)/m * 2 * pi) * alphamd[j]
    e2md <- sin(c(0:m)/m * 2 * pi) * alphamd[j]
    emd <- cbind(e1md, e2md)
    ttmd <- t(r %*% t(emd)) + rep(1, m + 1) %o% cent
    mat <- cbind(mat, ttmd)
}
minx <- min(mat[,c(T,F)])
maxx <- max(mat[,c(T,F)])
miny <- min(mat[,c(F,T)])
maxy <- max(mat[,c(F,T)])
plot(x,
     xlim=c(min(minx,x[,1]), max(maxx,x[,1])), 
     ylim=c(min(miny,x[,2]), max(maxy, x[,2])), 
     xlab = "Miles per Gallon",
     ylab = "Weight of car (in metric tonnes)",
     col = "blue" , pch = 19, cex=0.5)    
text(x, labels = rownames(mtcars), 
     col=rgb(0,0,1, 0.3), cex = 0.9)

for(i in 1:nquant){
  mx <- mat[,c(T,F)][,i]
  my <- mat[,c(F,T)][,i]
  lines(mx, my, type = "l", col = rgb(1/nquant*i,0,0.5,0.2), lwd = 20)
}

CALCULATING DISTANCE:

library(assertr)
maha_dist(mtcars, robust=TRUE)
##  [1]   16.900411   20.459267    9.146248   29.297695    8.876131   24.790601
##  [7]  121.478738  175.039432   28.076750   16.643037   17.484597   22.965004
## [13]    9.686254   15.237039   17.990925   19.112670   20.021190   22.820337
## [19]  187.893154   20.012898  420.103326   14.197465   41.340860  118.295449
## [25]   11.657889   16.756740  164.366741  335.749683  869.157574  341.368373
## [31] 1450.779293   19.964501
maha_dist(mtcars, robust=FALSE)
##  [1]  8.946673  8.287933  8.937150  6.096726  5.429061  8.877558  9.136276
##  [8] 10.030345 22.593116 12.393107 11.058878  9.476126  5.594527  6.026462
## [15] 11.201310  8.672093 12.257618  9.078630 14.954377 10.296463 13.432391
## [22]  6.227235  5.786691 11.681526  6.718085  3.645789 18.356164 14.000669
## [29] 21.573003 11.152850 19.192384  9.888781
# From https://builtin.com/data-science/mahalanobis-distance

cars.center <- colMeans(mtcars)
cars.cov <- cov(mtcars)

# The squared distances are:
distances <- mahalanobis(x = mtcars , center = cars.center, 
                         cov = cars.cov)

scl <- scale(mtcars,center = T, scale = F)

m <- matrix(0, nrow(mtcars),1)
for(i in 1:nrow(mtcars))
m[i,] <- as.matrix(t(scl[i,])) %*% solve(cars.cov) %*% as.matrix(scl[i,])
t(m)
##          [,1]     [,2]    [,3]     [,4]     [,5]     [,6]     [,7]     [,8]
## [1,] 8.946673 8.287933 8.93715 6.096726 5.429061 8.877558 9.136276 10.03034
##          [,9]    [,10]    [,11]    [,12]    [,13]    [,14]    [,15]    [,16]
## [1,] 22.59312 12.39311 11.05888 9.476126 5.594527 6.026462 11.20131 8.672093
##         [,17]   [,18]    [,19]    [,20]    [,21]    [,22]    [,23]    [,24]
## [1,] 12.25762 9.07863 14.95438 10.29646 13.43239 6.227235 5.786691 11.68153
##         [,25]    [,26]    [,27]    [,28]  [,29]    [,30]    [,31]    [,32]
## [1,] 6.718085 3.645789 18.35616 14.00067 21.573 11.15285 19.19238 9.888781
# Compare to
distances
##           Mazda RX4       Mazda RX4 Wag          Datsun 710      Hornet 4 Drive 
##            8.946673            8.287933            8.937150            6.096726 
##   Hornet Sportabout             Valiant          Duster 360           Merc 240D 
##            5.429061            8.877558            9.136276           10.030345 
##            Merc 230            Merc 280           Merc 280C          Merc 450SE 
##           22.593116           12.393107           11.058878            9.476126 
##          Merc 450SL         Merc 450SLC  Cadillac Fleetwood Lincoln Continental 
##            5.594527            6.026462           11.201310            8.672093 
##   Chrysler Imperial            Fiat 128         Honda Civic      Toyota Corolla 
##           12.257618            9.078630           14.954377           10.296463 
##       Toyota Corona    Dodge Challenger         AMC Javelin          Camaro Z28 
##           13.432391            6.227235            5.786691           11.681526 
##    Pontiac Firebird           Fiat X1-9       Porsche 914-2        Lotus Europa 
##            6.718085            3.645789           18.356164           14.000669 
##      Ford Pantera L        Ferrari Dino       Maserati Bora          Volvo 142E 
##           21.573003           11.152850           19.192384            9.888781

TESTING FOR OUTLIERS:

# Cutoff value for distances from Chi-Square Dist. 
# with p = 0.95 df = 2 which in ncol(air)
cutoff <- qchisq(p = 0.95 , df = ncol(mtcars))

## Display observation whose distance greater than cutoff value
mtcars[distances > cutoff ,]
##                 mpg cyl  disp  hp drat   wt qsec vs am gear carb
## Merc 230       22.8   4 140.8  95 3.92 3.15 22.9  1  0    4    2
## Ford Pantera L 15.8   8 351.0 264 4.22 3.17 14.5  0  1    5    4

Home Page

NOTE: These are tentative notes on different topics for personal use - expect mistakes and misunderstandings.