Data is from Phylogeny and metabolic scaling in mammals (2010) Capellini et al.
mammal_df <- read.csv("mammal_bmr.csv", sep=",", comment.char='#')
str(mammal_df)
## 'data.frame': 623 obs. of 9 variables:
## $ Subclass : chr "Eutheria" "Eutheria" "Eutheria" "Eutheria" ...
## $ Order : chr "Afrosoricida" "Afrosoricida" "Afrosoricida" "Afrosoricida" ...
## $ Species : chr "Amblysomus hottentotus" "Chrysochloris asiatica" "Echinops telfari" "Geogale aurita" ...
## $ BMR..mlO2.hour. : num 84.7 51.48 133.9 7.73 64.1 ...
## $ Body.mass.for.BMR..gr.: num 70 44 116.4 6.9 133 ...
## $ Reference.for.BMR : chr "White&Seymour03PNAS" "White et al06BiolLett" "White&Seymour03PNAS" "White et al06BiolLett" ...
## $ FMR..kJ.day. : num -9999 -9999 -9999 -9999 -9999 ...
## $ Body.mass.for.FMR..gr.: num -9999 -9999 -9999 -9999 -9999 ...
## $ Reference.for.FMR : chr "-9999" "-9999" "-9999" "-9999" ...
mammal_df$BMR <- mammal_df$BMR..mlO2.hour. # copying BMR column to a new column
mammal_df[(mammal_df$BMR == -9999),] = NA # fix weird method for assigning missing values
mammal_df <- na.omit(mammal_df)
summary(mammal_df$BMR) # maximum BMR is ~51,000
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.12 45.19 98.60 901.31 368.32 51419.00
mammal_df <- mammal_df %>%
mutate(BMR=log(BMR..mlO2.hour.)) # log transformation the BMRs
sum(mammal_df$BMR > 8) # only 30 samples (out of 580) are larger than 10^8 so I'm going to filter those out since it makes the data very hard to view (long tail)
## [1] 30
mammal_df <- mammal_df %>%
filter(BMR <= 8)
p1 <- ggplot(data=mammal_df, aes(x=BMR, y=..density..)) +
xlim(0,9) +
geom_histogram(color="grey60",fill="dodgerblue3",size=0.2) +
theme(panel.background=element_rect(fill = "aliceblue"))
print(p1)
p1 <- p1 + geom_density(linetype="dotted",size=0.75)
print(p1)
normPars <- fitdistr(mammal_df$BMR,"normal")
print(normPars)
## mean sd
## 4.71494765 1.32549513
## (0.05651930) (0.03996518)
# mean sd
# 4.71494765 1.32549513
# (0.05651930) (0.03996518)
str(normPars)
## List of 5
## $ estimate: Named num [1:2] 4.71 1.33
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 0.0565 0.04
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 0.00319 0 0 0.0016
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 550
## $ loglik : num -935
## - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
## mean
## 4.714948
# 4.714948
meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]
xval <- seq(0,max(mammal_df$BMR),len=length(mammal_df$BMR))
stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(mammal_df$BMR), args = list(mean = meanML, sd = sdML))
p1 + stat
Normal distribution doesn’t look awful, but this dataset has no zeroes
so the normal distribution appears skewed right
expoPars <- fitdistr(mammal_df$BMR,"exponential")
rateML <- expoPars$estimate["rate"]
stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="orange2", n = length(mammal_df$BMR), args = list(rate=rateML))
p1 + stat + stat2
This looks even worse than a normal distribution.
stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="green4", n = length(mammal_df$BMR), args = list(min=min(mammal_df$BMR), max=max(mammal_df$BMR)))
p1 + stat + stat2 + stat3
The data is definitely not uniform
gammaPars <- fitdistr(mammal_df$BMR,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]
stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="hotpink3", n = length(mammal_df$BMR), args = list(shape=shapeML, rate=rateML))
p1 + stat + stat2 + stat3 + stat4
Since a gamma distribution is basically a zero bounded normal distribution, this fits pretty well to the data
pSpecial <- ggplot(data=mammal_df, aes(x=BMR/(max(BMR + 0.1)), y=..density..)) +
geom_histogram(color="grey60",fill="dodgerblue3",size=0.2) +
theme(panel.background=element_rect(fill = "aliceblue")) +
xlim(c(0,1)) +
geom_density(size=0.75,linetype="dotted")
betaPars <- fitdistr(x=mammal_df$BMR/max(mammal_df$BMR + 0.1),start=list(shape1=1,shape2=2),"beta")
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]
statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(mammal_df$BMR), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial
Beta distribution requires rescaling the data to be bounded by 0 and 1, but otherwise this fits the observed probability distribution pretty well, but I don’t think as well as the gamma distribution
Based on the above graphs, I think that the gamma distribution fits the data best. This makes sense since the data is on BMR of mammals which have to be above zero and are also limited by the mammal’s size. This data set might be slightly skewed towards mammals with lower BMR.
# Get maximum likelihood parameters for a gamma distribution
gammaPars <- fitdistr(mammal_df$BMR,"gamma")
print(gammaPars)
## shape rate
## 12.6297495 2.6787878
## ( 0.7517515) ( 0.1626546)
# shape rate
# 12.6297495 2.6787878
# ( 0.7517515) ( 0.1626546)
str(gammaPars)
## List of 5
## $ estimate: Named num [1:2] 12.63 2.68
## ..- attr(*, "names")= chr [1:2] "shape" "rate"
## $ sd : Named num [1:2] 0.752 0.163
## ..- attr(*, "names")= chr [1:2] "shape" "rate"
## $ vcov : num [1:2, 1:2] 0.5651 0.1199 0.1199 0.0265
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "shape" "rate"
## .. ..$ : chr [1:2] "shape" "rate"
## $ loglik : num -922
## $ n : int 550
## - attr(*, "class")= chr "fitdistr"
sim_shape <- gammaPars$estimate["shape"] # 12.6297495
sim_rate <- gammaPars$estimate["rate"] # 2.678788
sim_gamma <- data.frame(rgamma(n=length(mammal_df$BMR), shape=sim_shape, rate=sim_rate))
sim_gamma$BMR <- sim_gamma$rgamma.n...length.mammal_df.BMR...shape...sim_shape..rate...sim_rate. # add new column
p2 <- ggplot(data=sim_gamma, aes(x=BMR, y=..density..)) +
xlim(0,9) +
theme(panel.background=element_rect(fill = "aliceblue")) +
geom_histogram(color="grey60",fill="lightblue",size=0.2) # new histrogram from simulated data
p2 <- p2 + geom_density(linetype="dotted",size=0.75) # add density curve
print(p2)
sim_stat <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="hotpink3", n = length(sim_gamma$BMR), args = list(shape=sim_shape, rate=sim_rate))
p2 + sim_stat
I think the simulated data does fit the observed data pretty well. However, the observed data is not as nicely bell shaped like the simulated data tries to make it. The observed data might have bias in it and the simulated data looks more like you’d expect. For example, there are 19 different mammal orders included in the data set. If we look at two of them, Carnivoria which tend to be relatively larger and have a lower BMR, and Rodentia which tend to be smaller and have a higher BMR, there are 10x more rodent species included in this data set than Carnivoria species which could cause the wider range of high BMR species.
length(unique(mammal_df$Order))
## [1] 19
sum(mammal_df$Order == "Carnivora")
## [1] 34
sum(mammal_df$Order == "Rodentia")
## [1] 265