@article{10.7717/peerj.11436,
  author   = {Etherington, Thomas R.},
  title    = {Mahalanobis distances for ecological niche modelling and outlier detection: implications of sample size, error, and bias for selecting and parameterising a multivariate location and scatter method},
  journal  = {PeerJ},
  year     = {2021},
  month    = may,
  volume   = {9},
  pages    = {e11436},
  issn     = {2167-8359},
  doi      = {10.7717/peerj.11436},
  url      = {https://doi.org/10.7717/peerj.11436},
  keywords = {Habitat suitability modelling, Minimum volume ellipsoid, Minimum covariance determinant, Resource selection functions, Species distribution modelling},
  abstract = {
The Mahalanobis distance is a statistical technique that has been used in statistics and data science for data classification and outlier detection, and in ecology to quantify species-environment relationships in habitat and ecological niche models. Mahalanobis distances are based on the location and scatter of a multivariate normal distribution, and can measure how distant any point in space is from the centre of this kind of distribution. Three different methods for calculating the multivariate location and scatter are commonly used: the sample mean and variance-covariance, the minimum covariance determinant, and the minimum volume ellipsoid. The minimum covariance determinant and minimum volume ellipsoid were developed to be robust to outliers by minimising the multivariate location and scatter for a subset of the full sample, with the proportion of the full sample forming the subset being controlled by a user-defined parameter. This outlier robustness means the minimum covariance determinant and the minimum volume ellipsoid are highly relevant for ecological niche analyses, which are usually based on natural history observations that are likely to contain errors. However, natural history observations will also contain extreme bias, to which the minimum covariance determinant and the minimum volume ellipsoid will also be sensitive. To provide guidance for selecting and parameterising a multivariate location and scatter method, a series of virtual ecological niche modelling experiments were conducted to demonstrate the performance of each multivariate location and scatter method under different levels of sample size, errors, and bias. The results show that there is no optimal modelling approach, and that choices need to be made based on the individual data and question. The sample mean and variance-covariance method will perform best on very small sample sizes if the data are free of error and bias. 
At larger sample sizes the minimum covariance determinant and minimum volume ellipsoid methods perform as well or better, but only if they are appropriately parameterised. Modellers who are more concerned about the prevalence of errors should retain a smaller proportion of the full data set, while modellers more concerned about the prevalence of bias should retain a larger proportion of the full data set. I conclude that Mahalanobis distances are a useful niche modelling technique, but only for questions relating to the fundamental niche of a species where the assumption of multivariate normality is reasonable. Users of the minimum covariance determinant and minimum volume ellipsoid methods must also clearly report their parameterisations so that the results can be interpreted correctly.
},
}