
rm(list = ls()) # clean slate
setwd(dirname(rstudioapi::getSourceEditorContext()$path))

source("./DI_code.R")

### Loading VOC data from NHANES website:
# X = as.matrix(SASxport::read.xport("./UVOC_I.XPT"))
# # This matrix is rather large.
# dim(X) # 3279  60
# colnames(X)
# # First column is SEQN, the patient number.
# # Many constant and binary variables.
# rownames(X) = X[,1]
# X = X[,-1]
# rownames(X)[1:5] # OK
# check = cellWise::checkDataSet(X, fracNA = 0,cleanNAfirst="rows")
# # 2030 rows and 21 columns remain
# X = check$remX
# colnames(X)
# # First variable is WTSA2YR = subsample weights, is not
# # a concentration of a VOC in urine, so remove:
# X = X[, -1]
# boxplot(scale((X))) # lots of skewness
# X = log(X)
# boxplot(scale((X))) # looks better
# # Remove the variables that have lots of measurements 
# # on the lower bound (due to much censoring)
# X = X[, -c(7, 11, 14, 18)]
# save(X,file="./VOC_rawdata.Rdata")

# Now load the VOC data from R file:
load("./VOC_rawdata.Rdata")
dim(X) # 2030  16
round(max(offdiag(abs(cor(X)))),2)
# 0.85 # so there are no "duplicate" variables
hist(offdiag(abs(cor(X))))
# some large correlations

# Load the age data from an R file:
load("./ageData.Rdata")
# These are 2 columns from the much larger dataset
# DEMO_I from NHANES, which contains demographic data.
# Match to individuals in X:
ageData = ageData[match(as.numeric(rownames(X)), ageData[,1]), ]
dim(ageData) # 2030 2 
colnames(ageData)
# Variable 2 is RIDAGEYR, the person's age in years.
summary(ageData[,2]) # age
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 3.00   10.00   29.00   33.16   54.00   80.00 

# Select children of age 10 or younger:
selectInds = which((ageData[,2] <= 10))
X = X[selectInds,]
dim(X) # 512 16
boxplot(scale((X))) # looks fine

# Run the Detection Imputation (DI) algorithm:
tic = Sys.time()
DI.out = DI(X, quant = 0.99)
toc = Sys.time(); toc - tic # 16 secs
dim(DI.out$allSigmas)[1] # 8 
# 8 covariance matrices have been computed. These are
# the initial one as well as those in each step.
# Therefore the method converged in 7 steps.

# Run cellFlagger with these estimates:
tic = Sys.time()
cellFlagger.out = cellFlagger(X, mu = DI.out$center, 
                              Sigma = DI.out$cov,
                              quant = 0.99)                      
toc = Sys.time(); toc - tic # 2.5 secs

W    = cellFlagger.out$W # indices 1 in W denote cellwise outliers
Zres = cellFlagger.out$Zres

# Draw cellmap:
pdf("VOCs_20_cellmap.pdf", height = 6)
rowsToShow = 1:20
Xtemp = X[rowsToShow, ]
Rtemp = Zres[rowsToShow, ]
Wtemp = W[rowsToShow, ]           
cellMap2(Xtemp, Rtemp, 
         indcells = which(Wtemp == 1), 
         columnlabels = colnames(Xtemp), 
         rowlabels = 1:20,
         mTitle = "VOCs in children",
         rowtitle = "first 20 children",
         columntitle = "volatile components",
         sizetitles = 2,
         drawCircles = F)
rm(rowsToShow,Xtemp,Rtemp,Wtemp)
dev.off()

# Variable 8 has a substantial number of red cells.
# Its total number of outlying cells:
sum(W[,8])/nrow(X) # 0.1132812
# Variable 8 has 11% of outlying cells.
# Since quant = 0.99 these are the cells with absolute
# residual above sqrt(qchisq(p=0.99,df=1)) = 2.575829 .
# Variable 8 is "URXCYM":  
#   N-Acetyl-S-(2-cyanoethyl)-L-cysteine (ng/mL)
# this is a well-known biomarker for exposure to tobacco 
# smoke. see e.g.
# https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0210104
# Adults who smoke usually have high values.

# How many URXCYM values in this set are marginally outlying?
# If we would use univariate outlier detection, few of 
# the URXCYM values in this set would be considered suspicious:
meds = apply(X,2,FUN="median")
mads = apply(X,2,FUN="mad")
Z = scale(X,center=meds,scale=mads)
cutoff = sqrt(qchisq(p=0.99,df=1)); cutoff # 2.5758

cellInd = which(abs(Zres[,8]) > cutoff)
length(cellInd)/nrow(X) # 0.1132812 # over 11%
marginalInd = which(abs(Z[,8]) > cutoff)
length(marginalInd)/nrow(X) # 0.01953125 # under 2%
# Even for perfectly gaussian data this would already be 1%.

pdf("ZresVersusZ.pdf",width=5.4,height=5.4) # sizes in inches
plot(Z[,8], Zres[,8], xlab = "",ylab = "",main = "",pch = 16, 
     col = "black", xlim=c(-3,5)) # a bit of left censoring in Z
title(main="log(URXCYM) in children aged 10 or younger",
      line=1) # , cex.lab=1.2, family="Calibri Light")
title(ylab="standardized cellwise residuals", line=2.3)
title(xlab="robustly standardized marginal values", line=2.3)
abline(h=cutoff, col="red")
abline(h=-cutoff, col="red")
abline(v=cutoff, col="red")
abline(v=-cutoff, col="red")
dev.off()
# A lot of the outlying residuals occur at innocent
# looking marginal values.
# These persons' URXCYM is high relative to the other 
# compounds (variables) in the same person.


# Link this with smoking
########################

load("SmokingSelf.Rdata")     
# This is the SMQ_I questionnaire data from NHANES.
# It describes the smoking status/habits of the person.
dim(SMQ_I) # 7001  42
# Match to individuals in X:
selfSmoking = SMQ_I[match(as.numeric(rownames(X)), SMQ_I[,1]), ]
dim(selfSmoking) # 512 42

# Do any of these children smoke? No:
colnames(SMQ_I)
# Variable 21 is SMQ621: cigarettes smoked in life.
# Category 1 is "none".
which(as.numeric(selfSmoking[, 21]) > 1) # integer(0)

# Smoking behavior of family members:
load("SmokingFamily.Rdata") 
# This is the SMQFAM_I questionnaire data from NHANES.
# It describes the smoking habits of household members.
dim(SMQFAM_I) # 9971 4
colnames(SMQFAM_I)
# Variable 2 is SMD460: How many people who live here 
#                       smoke tobacco?
# Variable 3 is SMD470: How many people smoke inside
#                       this home?
# Match to individuals in X:
familySmoking = SMQFAM_I[match(as.numeric(rownames(X)), SMQFAM_I[,1]), ]
dim(familySmoking) # 512 4
remove(SMQFAM_I,SMQ_I) # save space

# Look at the residuals for the children who 
# live together with people who smoke.
# We consider 4 categories:

# children without smokers in their family:
nonsmokers = which(familySmoking[, 2] == 0) 
length(nonsmokers) # 340
# at least one adult smokes, but not in the home:
noneInHome = which((familySmoking[, 2] > 0) &
                      (familySmoking[, 3] == 0))
length(noneInHome) # 109
# children with 1 person smoking in their home:
oneInHome = which(familySmoking[, 3] == 1)
length(oneInHome) # 33
# children with 2 people smoking in their home:
twoInHome = which(familySmoking[, 3] == 2) 
length(twoInHome) # 11
rm(selfSmoking,familySmoking)

length(which(Zres[nonsmokers,8] > 0))/length(nonsmokers) # 0.04705882
length(which(Zres[noneInHome,8] > 0))/length(noneInHome) # 0.1192661
length(which(Zres[oneInHome,8] > 0))/length(oneInHome) # 0.3939394
length(which(Zres[twoInHome,8] > 0))/length(twoInHome) # 0.7272727

# So 39% of the children living in a house with one smoker 
# have suspiciously high levels for this biomarker.
# 73% of the children living in a house with two smokers
# have suspiciously high levels for this biomarker.

cutCellmap(oneInHome, X, Zres, W)
cutCellmap(twoInHome, X, Zres, W)
# For one or more smokers in the house:
smokeInHome = c(oneInHome,twoInHome)
length(smokeInHome) # 44
cutCellmap(smokeInHome, Z, Zres, W)
# In all of these cellmaps the variable URXCYM stands out!

# If we would use a univariate detection bound, many of these values
# wouldn't be considered suspicious:

length(which(Z[nonsmokers] > cutoff))/length(nonsmokers) # 0.020588
length(which(Z[noneInHome] > cutoff))/length(noneInHome) # 0.027522
length(which(Z[oneInHome] > cutoff))/length(oneInHome) # 0
length(which(Z[twoInHome] > cutoff))/length(twoInHome) # 0
# Here the fractions are not even increasing with the number of smokers.

plotdata = matrix(c(length(which(Zres[nonsmokers,8] > 0))/length(nonsmokers),
                    length(which(Zres[noneInHome,8] > 0))/length(noneInHome), 
                    length(which(Zres[oneInHome,8] > 0))/length(oneInHome), 
                    length(which(Zres[twoInHome,8] > 0))/length(twoInHome),
                    length(which(Z[nonsmokers] > cutoff))/length(nonsmokers),
                    length(which(Z[noneInHome] > cutoff))/length(noneInHome),
                    length(which(Z[oneInHome] > cutoff))/length(oneInHome),
                    length(which(Z[twoInHome] > cutoff))/length(twoInHome)),
                  nrow = 2, byrow = TRUE)


pdf("cellwise_marginal.pdf",width=5.4,height=5.4)
matplot(1:4, t(plotdata), type = "b", pch = 16, lwd = 3,
        cex = 2, xlab = "", xaxt = "n", ylab = "", yaxt = "n", 
        ylim = c(0, 0.8), col = c("blue", "red"), lty = 1)
axis(side = 1, labels = c("none", "0 in home", "1 in home", "2 in home"),
     at = 1:4, cex.axis = 1.3)
axis(side = 2, labels = seq(0, 100, by = 20),
     at = seq(0, 1, by = 0.2), cex.axis = 1.3)
legend("topleft", fill = c("blue", "red"),
       legend = c("cell residuals","marginal values"), cex = 1.3)
title(main="Effect of smokers on elevated URXCYM in children",
      line=1.2) 
title(ylab="% of children with elevated URXCYM",cex.lab=1.3, line=2.3)
title(xlab="smoking adults in household",cex.lab=1.3, line=2.3)
dev.off()

