This markdown doc contains code used to analyse the association of APOE alleles with dementia in Genes & Health. Ben Jacobs 14-08-25 b.jacobs@qmul.ac.uk
cd /home/ivm/apoe/scratch
plink2 --vcf /genesandhealth/library-red/genesandhealth/GSAv3EAMD/Oct2023_51k_TOPMED-r2_Imputation_b38/unfiltered_merged_vcfs/chr19.dose.merged.vcf.gz \
--chr 19 \
--from-bp 44908684 \
--to-bp 44908822 \
--make-pfile \
--out apoe_region
cat /home/ivm/apoe/scratch/apoe_region.pvar | grep -v -E "##"
echo "chr19:44908684:T:C" > snps_to_keep
echo "chr19:44908822:C:T" >> snps_to_keep
plink2 --pfile apoe_region \
--extract snps_to_keep \
--export haps \
--out apoe_haplotypes
library(tidyverse)
# read in haps
apoe_haps = read_table("/home/ivm/apoe/scratch/apoe_haplotypes.haps",col_names = F)
apoe_samples = read_table("/home/ivm//apoe/scratch/apoe_haplotypes.sample",col_names = T) %>%
filter(ID_2 != 0)
colnames(apoe_haps)[c(1:5)] = c("CHR","SNP","POS","REF","ALT")
colnames = list()
for(i in c(1:length(apoe_samples$ID_2))){
colnames[[length(colnames)+1]] = paste0(apoe_samples$ID_2[i],"_mat")
colnames[[length(colnames)+1]] = paste0(apoe_samples$ID_2[i],"_pat")
}
colnames(apoe_haps)[c(6:ncol(apoe_haps))] = unlist(colnames)
# spread long
apoe_haps = apoe_haps %>%
pivot_longer(cols = c(6:ncol(apoe_haps))) %>%
pivot_wider(id_cols = name, values_from = value, names_from = SNP)
apoe_haps %>%
dplyr::count(`chr19:44908684:T:C`) %>%
mutate(prop = n/sum(n))
# define haplotypes
apoe_haps = apoe_haps %>%
mutate(apoe_status = case_when(
`chr19:44908684:T:C` == 0 & `chr19:44908822:C:T` == 1 ~ "APOE4",
`chr19:44908684:T:C` == 1 & `chr19:44908822:C:T` == 1 ~ "APOE3",
`chr19:44908684:T:C` == 1 & `chr19:44908822:C:T` == 0 ~ "APOE2",
`chr19:44908684:T:C` == 0 & `chr19:44908822:C:T` == 0 ~ "APOE1"
)) %>%
separate(name,sep = "_", into = c("IID","FID","other","chrom")) %>%
dplyr::select(-FID,-other) %>%
pivot_wider(id_cols = IID,values_from = apoe_status,names_from = chrom) %>%
mutate(apoe4_doses =
case_when(mat == "APOE4" & pat == "APOE4" ~ "E4/E4",
mat == "APOE3" & pat == "APOE3" ~ "E3/E3",
mat == "APOE2" & pat == "APOE2" ~ "E2/E2",
mat == "APOE1" & pat == "APOE1" ~ "E1/E1",
mat == "APOE4" & pat == "APOE3" ~ "E4/E3",
mat == "APOE3" & pat == "APOE4" ~ "E4/E3",
mat == "APOE4" & pat == "APOE2" ~ "E4/E2",
mat == "APOE2" & pat == "APOE4" ~ "E4/E2",
mat == "APOE4" & pat == "APOE1" ~ "E4/E1",
mat == "APOE1" & pat == "APOE4" ~ "E4/E1",
mat == "APOE3" & pat == "APOE2" ~ "E3/E2",
mat == "APOE2" & pat == "APOE3" ~ "E3/E2",
mat == "APOE3" & pat == "APOE1" ~ "E3/E1",
mat == "APOE1" & pat == "APOE3" ~ "E3/E1",
mat == "APOE2" & pat == "APOE1" ~ "E2/E1",
mat == "APOE1" & pat == "APOE2" ~ "E2/E1"
))
# get just e4 dose
apoe_haps = apoe_haps %>%
mutate(apoe4_alleles = case_when(
apoe4_doses == "E4/E4" ~ 2,
apoe4_doses == "E4/E2" ~ 1,
apoe4_doses == "E4/E3" ~ 1,
apoe4_doses == "E4/E1" ~ 1,
.default = 0))
apoe_haps = apoe_haps %>%
mutate(apoe3_alleles = case_when(
mat == "APOE3" & pat == "APOE3" ~ 2,
mat == "APOE3" & pat != "APOE3" ~ 1,
mat != "APOE3" & pat == "APOE3" ~ 1,
mat != "APOE3" & pat != "APOE3" ~ 0,
.default = 0))
apoe_haps = apoe_haps %>%
mutate(apoe2_alleles = case_when(
mat == "APOE2" & pat == "APOE2" ~ 2,
mat == "APOE2" & pat != "APOE2" ~ 1,
mat != "APOE2" & pat == "APOE2" ~ 1,
mat != "APOE2" & pat != "APOE2" ~ 0,
.default = 0))
# binomial tests
binom.test(x = 10016,n = (50490+614),p = 0.288,alternative = "less")
binom.test(x = 49280,n = 50490,p = 0.944)
# AF heatmap
props = apoe_haps %>%
dplyr::count(apoe4_doses) %>%
mutate(pct_gh = 100*n/sum(n)) %>%
dplyr::select(-n)
props_ukb = data.frame(
apoe4_doses = c("E2/E2", "E3/E2", "E4/E2", "E3/E3", "E4/E3", "E4/E4"),
pct_ukb = c(0.6, 12.3, 2.6, 58.2, 23.9, 2.4)
)
props = props %>% left_join(props_ukb,by="apoe4_doses")
colnames(props) = c("genotype","Genes & Health (SAS)","UK Biobank (EUR)")
props = props %>%
pivot_longer(cols = c(2,3))
props = props %>% arrange(genotype)
props$genotype = factor(props$genotype,levels = unique(props$genotype))
props = props %>% group_by(name) %>% mutate(cumsum = cumsum(value)) %>%
mutate(pos = 100-cumsum)
props$pos2 = c(100,100,props$pos[-c(11,12)])
props = props %>%
ungroup %>%
mutate(midpoint = (pos + pos2 ) / 2)
png("/home/ivm/apoe/export/afs.png",res=900,units="in",width=6,height=6)
ggplot(props,aes(name,value,fill=genotype))+
geom_col(color="black")+
scale_fill_brewer(palette="Set1")+
theme_bw()+
labs(fill="APOE allele",x="Cohort",y="Percentage of cohort (%)")+
geom_text(data = props %>% mutate(label = ifelse(value < 5," ",paste0(round(value,1),"%"))),
mapping = aes(label = label,y=midpoint))
dev.off()
# get phenotype data
dementia = read_csv("/genesandhealth/library-red/genesandhealth/phenotypes_curated/version010_2025_04/BI_PY/outputs/custom_phenotypes/individual_trait_files/2025_05_Dementia_summary_report.csv")
ad = read_csv("/genesandhealth/library-red/genesandhealth/phenotypes_curated/version010_2025_04/BI_PY/outputs/icd10/individual_trait_files/3_digit_icd/2025_05_G30_summary_report.csv")
dementia_nos = read_csv("/genesandhealth/library-red/genesandhealth/phenotypes_curated/version010_2025_04/BI_PY/outputs/icd10/individual_trait_files/3_digit_icd/2025_05_F03_summary_report.csv")
vad = read_csv("/genesandhealth/library-red/genesandhealth/phenotypes_curated/version010_2025_04/BI_PY/outputs/icd10/individual_trait_files/3_digit_icd/2025_05_F01_summary_report.csv")
# combine all
bind_rows(dementia,ad,dementia_nos,vad) %>% distinct(nhs_number)
link_file = read_csv("/genesandhealth/library-red/genesandhealth/2025_02_10__MegaLinkage_forTRE.csv") %>%
dplyr::rename("IID" = OrageneID,"nhs_number"=`pseudonhs_2024-07-10`) %>%
dplyr::select(IID,nhs_number)
dob = read_csv("/genesandhealth/library-red/genesandhealth/phenotypes_rawdata/QMUL__Stage1Questionnaire/2025_04_25__S1QSTredacted.csv") %>%
mutate(s1qst_dob = as.Date(paste0("01-",`S1QST_MM-YYYY_ofBirth`),format="%d-%m-%Y")) %>%
mutate(IID = as.character(S1QST_Oragene_ID)) %>%
dplyr::select(IID,s1qst_dob)
dementia = dementia %>%
inner_join(link_file,by="nhs_number")
# take earliest date
dementia = dementia %>% group_by(nhs_number) %>% slice_min(date,n=1,with_ties = F) %>% ungroup()
# join haplotypes with dementia status
apoe_haps = apoe_haps %>%
mutate(IID = as.numeric(IID)) %>%
left_join(dementia,by="IID") %>%
mutate(dementia_status = ifelse(is.na(age_at_event),"Control","Case"))
# join with dob
apoe_haps = apoe_haps %>%
inner_join(dob %>% mutate(IID = as.numeric(IID)),by="IID")
# add covars
cov = read_tsv("/genesandhealth/library-red/genesandhealth/GSAv3EAMD/Oct2023_51k_TOPMED-r2_Imputation_b38/GNH.51170.noEthnicOutliers.covariates.20PCs.tab")
apoe_haps = apoe_haps %>%
mutate(IID = as.character(IID)) %>%
inner_join(cov %>%
mutate(IID = as.character(OrageneID)),by="IID")
# add other phenotypes
## vad
apoe_haps = apoe_haps %>%
left_join(vad %>%
inner_join(link_file,by="nhs_number") %>%
dplyr::rename("age_at_event_vad" = age_at_event) %>%
mutate(IID = as.character(IID)),by="IID") %>%
mutate(vad_status = ifelse(is.na(age_at_event_vad),"Control","Case"))
## AD
apoe_haps = apoe_haps %>%
left_join(ad %>%
inner_join(link_file,by="nhs_number") %>%
dplyr::rename("age_at_event_ad" = age_at_event) %>%
mutate(IID = as.character(IID)),by="IID") %>%
mutate(ad_status = ifelse(is.na(age_at_event_ad),"Control","Case"))
## NOS
apoe_haps = apoe_haps %>%
left_join(dementia_nos %>%
inner_join(link_file,by="nhs_number") %>%
dplyr::rename("age_at_event_nos" = age_at_event) %>%
mutate(IID = as.character(IID)),by="IID") %>%
mutate(dementia_nos__status = ifelse(is.na(age_at_event_nos),"Control","Case"))
apoe_haps %>% glimpse()
# survival analysis
library(survival)
library(survminer)
# define event
apoe_haps = apoe_haps %>%
mutate(age_at_data_extract = as.numeric(as.Date("2024-01-01",format="%Y-%m-%d") - s1qst_dob)/365.25) %>%
mutate(status = ifelse(dementia_status == "Control",0,1)) %>%
mutate(survtime = ifelse(dementia_status == "Control",age_at_data_extract-16,age_at_event-16))
# demographics
apoe_haps
hist(apoe_haps$AgeAtRecruitment)
summary(apoe_haps$AgeAtRecruitment)
# exclude ambiguous
apoe_haps = apoe_haps %>% filter(pop.GSA.51k!="Ambiguous")
nrow(apoe_haps)
apoe_haps %>% nrow()
apoe_haps %>% filter(AgeAtRecruitment<18)
apoe_haps %>% dplyr::count(dementia_status)
apoe_haps %>% dplyr::count(apoe4_alleles) %>% mutate(prop = n/sum(n))
apoe_haps %>% dplyr::count(apoe3_alleles) %>% mutate(prop = n/sum(n))
# get AFs
nchroms = nrow(apoe_haps)*2
apoe_haps %>% dplyr::count(apoe2_alleles) %>% mutate(n_dose = n*apoe2_alleles) %>% mutate(af = sum(n_dose)/ nchroms)
apoe_haps %>% dplyr::count(apoe3_alleles) %>% mutate(n_dose = n*apoe3_alleles) %>% mutate(af = sum(n_dose)/ nchroms)
apoe_haps %>% dplyr::count(apoe4_alleles) %>% mutate(n_dose = n*apoe4_alleles) %>% mutate(af = sum(n_dose)/nchroms)
# stratified
apoe_haps %>% group_by(dementia_status) %>% dplyr::count(apoe2_alleles) %>% mutate(n_dose = n*apoe2_alleles) %>% mutate(af = sum(n_dose)/ (2*sum(n)))
apoe_haps %>% group_by(dementia_status) %>% dplyr::count(apoe3_alleles) %>% mutate(n_dose = n*apoe3_alleles) %>% mutate(af = sum(n_dose)/ (2*sum(n)))
apoe_haps %>% group_by(dementia_status) %>% dplyr::count(apoe4_alleles) %>% mutate(n_dose = n*apoe4_alleles) %>% mutate(af = sum(n_dose)/ (2*sum(n)))
# demographics
apoe_haps %>% summarise_at(vars(AgeAtRecruitment),.funs = c("median","IQR"))
apoe_haps %>% group_by(dementia_status) %>% summarise_at(vars(AgeAtRecruitment),.funs = c("median","IQR"))
apoe_haps %>% group_by(dementia_status) %>% dplyr::count(S1QST_Gender) %>% mutate(pct = n/sum(n)*100)
apoe_haps %>% dplyr::count(S1QST_Gender) %>% mutate(pct = n/sum(n)*100)
apoe_haps %>% dplyr::count(apoe4_doses) %>% mutate(pct = n/sum(n)*100) %>%
mutate(lab = paste0(n," ",round(pct,1),"%"))
apoe_haps %>% group_by(dementia_status) %>% dplyr::count(apoe4_doses) %>% mutate(pct = n/sum(n)*100) %>%
mutate(lab = paste0(n," ",round(pct,1),"%")) %>%
pivot_wider(id_cols = apoe4_doses,values_from = lab,names_from = dementia_status)
apoe_haps %>% group_by(dementia_status) %>% dplyr::count(apoe4_alleles) %>% mutate(pct = n/sum(n)*100) %>%
mutate(lab = paste0(n," ",round(pct,1),"%")) %>%
pivot_wider(id_cols = apoe4_alleles,values_from = lab,names_from = dementia_status)
apoe_haps %>% group_by(dementia_status) %>% dplyr::count(pop.GSA.51k) %>% mutate(pct = n/sum(n)*100) %>% arrange(pct)
apoe_haps %>% filter(dementia_status=="Case") %>% summarise_at(vars(age_at_event),.funs = c("median","IQR"),na.rm=T)
apoe_haps %>% group_by(pop.GSA.51k) %>%
dplyr::count(apoe4_alleles) %>%
mutate(n_alleles = apoe4_alleles*n) %>%
summarise(total_alleles = sum(n_alleles),total_chroms = 2*sum(n)) %>%
mutate(af = total_alleles / total_chroms)
cases = apoe_haps %>% filter(dementia_status=="Case") %>% mutate(case_type = ifelse(age_at_event > AgeAtRecruitment,"Incident","Prevalent"))
cases %>%
dplyr::count(case_type) %>%
mutate(total = sum(n),prop = n/total)
# unadjusted
glm(data = apoe_haps %>%
mutate(dementia_status = ifelse(dementia_status == "Case",1,0)),
dementia_status ~ apoe4_doses,
family=binomial(link="logit")) %>%
summary()
# adjusted
# set ref
apoe_haps$apoe4_doses = relevel(factor(apoe_haps$apoe4_doses),ref="E3/E3")
model = glm(data = apoe_haps %>%
mutate(dementia_status = ifelse(dementia_status == "Case",1,0)),
dementia_status ~ S1QST_Gender + AgeAtRecruitment + pop.GSA.51k +
PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
apoe4_doses,
family=binomial(link="logit")) %>%
broom::tidy()
model
coefs = model %>%
filter(grepl("apoe4",term)) %>%
mutate(term = str_remove_all(term,"apoe4_doses")) %>%
bind_rows(
data.frame(
term = "E3/E3",estimate=0,std.error =0.,statistic=NA,p.value=1
)
) %>%
mutate(or = exp(estimate)) %>%
mutate(lowerci = exp(estimate-1.96*std.error)) %>%
mutate(upperci = exp(estimate+1.96*std.error))
coefs
model = glm(data = apoe_haps %>%
mutate(dementia_status = ifelse(dementia_status == "Case",1,0)),
dementia_status ~ S1QST_Gender + AgeAtRecruitment + pop.GSA.51k +
PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
apoe4_doses,
family=binomial(link="logit")) %>%
broom::tidy()
# repeat using e2/e2 as ref
# set ref
apoe_haps$apoe4_doses = relevel(factor(apoe_haps$apoe4_doses),ref="E2/E2")
model = glm(data = apoe_haps %>%
mutate(dementia_status = ifelse(dementia_status == "Case",1,0)),
dementia_status ~ S1QST_Gender + AgeAtRecruitment + pop.GSA.51k +
PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
apoe4_doses,
family=binomial(link="logit")) %>%
broom::tidy()
model
coefs = model %>%
filter(grepl("apoe4",term)) %>%
mutate(term = str_remove_all(term,"apoe4_doses")) %>%
bind_rows(
data.frame(
term = "E2/E2",estimate=0,std.error =0.,statistic=NA,p.value=1
)
) %>%
mutate(or = exp(estimate)) %>%
mutate(lowerci = exp(estimate-1.96*std.error)) %>%
mutate(upperci = exp(estimate+1.96*std.error))
coefs
apoe_haps$apoe4_doses = relevel(factor(apoe_haps$apoe4_doses),ref="E3/E3")
# repeat for apoe4 alone
glm(data = apoe_haps %>%
mutate(dementia_status = ifelse(dementia_status == "Case",1,0)) %>%
mutate(apoe4_carrier = ifelse(apoe4_alleles>0,"yes","no")),
dementia_status ~ S1QST_Gender + AgeAtRecruitment + pop.GSA.51k +
PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
apoe4_carrier,
family=binomial(link="logit")) %>%
broom::tidy() %>%
filter(term == "apoe4_carrieryes") %>%
mutate(or = exp(estimate))
glm(data = apoe_haps %>%
mutate(dementia_status = ifelse(dementia_status == "Case",1,0)) %>%
mutate(apoe4_carrier = ifelse(apoe4_alleles>0,"yes","no")),
dementia_status ~ S1QST_Gender + AgeAtRecruitment + pop.GSA.51k +
PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
apoe4_carrier,
family=binomial(link="log")) %>%
broom::tidy() %>%
filter(term == "apoe4_carrieryes") %>%
mutate(rr = exp(estimate))
# PAF for apoe4 specifically
## prop exposed to apoe4
props = apoe_haps %>% dplyr::count(apoe4_alleles != 0) %>% mutate(prop = n/sum(n))
prop_exp = props[props$`apoe4_alleles != 0`==T,]$prop
props_by_case_status = apoe_haps %>% group_by(dementia_status) %>%
dplyr::count(apoe4_alleles != 0) %>% mutate(prop = n/sum(n)) %>%
filter(dementia_status=="Case")
prop_exp_cases = props_by_case_status[props_by_case_status$`apoe4_alleles != 0`==T,]$prop
## rr given apoe4
tbl = apoe_haps %>%
group_by(apoe4_alleles != 0) %>%
dplyr::count(dementia_status)%>%
mutate(prop = n/sum(n)) %>%
filter(dementia_status=="Case")
rr = tbl[tbl$`apoe4_alleles != 0`==T,]$prop / tbl[tbl$`apoe4_alleles != 0`==F,]$prop
## manual calc
rr_manual = 1.44
prev_manual = 0.195562
paf_levin = ( prev_manual * (rr_manual - 1) ) / ( prev_manual * (rr_manual - 1) + 1)
## function to compute PAFs dichotomously
paf_calc_dichotomous = function(beta,se,prop_exposure,prop_exposure_cases){
# Levin dichotomous
rr = exp(beta)
lower_rr = exp(beta-1.96*se)
upper_rr = exp(beta+1.96*se)
paf_levin = ( prop_exposure * (rr - 1) ) / ( prop_exposure * (rr - 1) + 1)
paf_levin_lower = ( prop_exposure * (lower_rr - 1) ) / ( prop_exposure * (lower_rr - 1) + 1)
paf_levin_upper = ( prop_exposure * (upper_rr - 1) ) / ( prop_exposure * (upper_rr - 1) + 1)
# Miettenin dichotomous
paf_miettenin = ( (rr-1) / rr) * prop_exposure_cases
paf_miettenin_lower = ( (lower_rr-1) / lower_rr) * prop_exposure_cases
paf_miettenin_upper = ( (upper_rr-1) / upper_rr) * prop_exposure_cases
data.frame(
"PAF" = c(paf_levin,paf_miettenin),
"Lower_CI" = c(paf_levin_lower,paf_miettenin_lower),
"Upper_CI" = c(paf_levin_upper,paf_miettenin_upper),
"Method" = c("Levin","Miettenin")
)
}
res = paf_calc_dichotomous(0.48,0.101,prop_exp,prop_exp_cases) %>%
mutate(exposure = "APOE4 (dominant)")
# Repeat for e3/e4
## prop exposed to apoe4
apoe_haps = apoe_haps %>%
mutate(apoe3_or_4_carrier = ifelse(mat == "APOE3" | pat =="APOE3" | mat == "APOE4" | pat =="APOE4","yes","no"))
props = apoe_haps %>% dplyr::count(apoe3_or_4_carrier) %>% mutate(prop = n/sum(n))
prop_exp = props[props$apoe3_or_4_carrier=="yes",]$prop
props_by_case_status = apoe_haps %>% group_by(dementia_status) %>%
dplyr::count(apoe3_or_4_carrier) %>% mutate(prop = n/sum(n)) %>%
filter(dementia_status=="Case")
prop_exp_cases = props_by_case_status[props_by_case_status$apoe3_or_4_carrier=="yes",]$prop
# repeat for apoe4 or apoe3 alone
glm(data = apoe_haps %>%
mutate(dementia_status = ifelse(dementia_status == "Case",1,0)),
dementia_status ~ S1QST_Gender + AgeAtRecruitment + pop.GSA.51k +
PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
apoe3_or_4_carrier,
family=binomial(link="logit")) %>%
broom::tidy() %>%
filter(term == "apoe3_or_4_carrieryes") %>%
mutate(or = exp(estimate))
## rr given apoe4 or apoe3
tbl = apoe_haps %>%
group_by(apoe3_or_4_carrier) %>%
dplyr::count(dementia_status)%>%
mutate(prop = n/sum(n)) %>%
filter(dementia_status=="Case")
rr = tbl[tbl$apoe3_or_4_carrier=="yes",]$prop / tbl[tbl$apoe3_or_4_carrier=="no",]$prop
res2 = paf_calc_dichotomous(0.983,1.07,prop_exp,prop_exp_cases) %>%
mutate(exposure = "APOE3 or 4 (dominant)")
# stratified method
## just apoe4
case_counts = apoe_haps %>%
group_by(apoe4_alleles) %>%
dplyr::count(dementia_status) %>%
filter(dementia_status == "Case") %>%
ungroup() %>%
mutate(case_fraction = n/sum(n))%>%
mutate(apoe4_alleles = as.character(apoe4_alleles))
geno_prev = apoe_haps %>%
dplyr::count(apoe4_alleles) %>%
mutate(prev = n/sum(n))%>%
mutate(apoe4_alleles = as.character(apoe4_alleles))
pafs = glm(data = apoe_haps %>%
mutate(apoe4_alleles = as.character(apoe4_alleles)) %>%
mutate(dementia_status = ifelse(dementia_status == "Case",1,0)),
dementia_status ~ S1QST_Gender + AgeAtRecruitment + pop.GSA.51k +
PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
apoe4_alleles,
family=binomial(link="logit")) %>%
broom::tidy() %>%
filter(grepl("apoe",term)) %>%
mutate(apoe4_alleles = str_remove_all(term,"apoe4_alleles")) %>%
mutate(or = exp(estimate)) %>%
mutate(lower = exp(estimate - 1.96*std.error)) %>%
mutate(upper = exp(estimate + 1.96*std.error)) %>%
left_join(case_counts,by="apoe4_alleles") %>%
left_join(geno_prev %>% mutate(total = n),by="apoe4_alleles") %>%
mutate(paf_est = (or-1)/or*case_fraction) %>%
mutate(paf_est_lower = (lower-1)/lower*case_fraction) %>%
mutate(paf_est_upper = (upper-1)/upper*case_fraction) %>%
dplyr::select(apoe4_alleles,contains("paf"))
pafs
pafs %>% dplyr::select(-1) %>% colSums()
## apoe3 or apoe4
case_counts = apoe_haps %>%
group_by(apoe4_doses) %>%
dplyr::count(dementia_status) %>%
filter(dementia_status == "Case") %>%
ungroup() %>%
mutate(case_fraction = n/sum(n))
geno_prev = apoe_haps %>%
dplyr::count(apoe4_doses) %>%
mutate(prev = n/sum(n))
apoe_haps$apoe4_doses = relevel(apoe_haps$apoe4_doses,ref="E2/E2")
pafs = glm(data = apoe_haps %>%
mutate(dementia_status = ifelse(dementia_status == "Case",1,0)),
dementia_status ~ S1QST_Gender + AgeAtRecruitment + pop.GSA.51k +
PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
apoe4_doses,
family=binomial(link="logit")) %>%
broom::tidy() %>%
filter(grepl("apoe",term)) %>%
mutate(apoe4_doses = str_remove_all(term,"apoe4_doses")) %>%
mutate(or = exp(estimate)) %>%
mutate(lower = exp(estimate - 1.96*std.error)) %>%
mutate(upper = exp(estimate + 1.96*std.error)) %>%
left_join(case_counts,by="apoe4_doses") %>%
left_join(geno_prev %>% mutate(total = n),by="apoe4_doses") %>%
mutate(paf_est = (or-1)/or*case_fraction) %>%
mutate(paf_est_lower = (lower-1)/lower*case_fraction) %>%
mutate(paf_est_upper = (upper-1)/upper*case_fraction) %>%
dplyr::select(apoe4_doses,contains("paf"))
pafs
# get ratios
model = glm(data = apoe_haps %>%
mutate(dementia_status = ifelse(dementia_status == "Case",1,0)),
dementia_status ~ S1QST_Gender + AgeAtRecruitment + pop.GSA.51k +
PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
apoe4_doses,
family=binomial(link="logit")) %>%
broom::tidy() %>%
filter(grepl("apoe",term)) %>%
mutate(apoe4_doses = str_remove_all(term,"apoe4_doses")) %>%
mutate(or = exp(estimate)) %>%
mutate(lower = exp(estimate - 1.96*std.error)) %>%
mutate(upper = exp(estimate + 1.96*std.error)) %>%
dplyr::select(apoe4_doses,or,lower,upper)
# ratios
ratio = model[model$apoe4_doses=="E4/E2",]$or/model[model$apoe4_doses=="E3/E2",]$or
upperratio = model[model$apoe4_doses=="E4/E2",]$upper/model[model$apoe4_doses=="E3/E2",]$lower
lowerratio = model[model$apoe4_doses=="E4/E2",]$lower/model[model$apoe4_doses=="E3/E2",]$upper
# e4 calc
e4 = data.frame(
apoe4_doses = "Combined E4",
paf_est = pafs[pafs$apoe4_doses=="E4/E2",]$paf_est + pafs[pafs$apoe4_doses=="E4/E4",]$paf_est + (ratio / (ratio+1) ) * pafs[pafs$apoe4_doses=="E4/E3",]$paf_est,
paf_est_lower = pafs[pafs$apoe4_doses=="E4/E2",]$paf_est_lower + pafs[pafs$apoe4_doses=="E4/E4",]$paf_est_lower + (lowerratio / (lowerratio+1) ) * pafs[pafs$apoe4_doses=="E4/E3",]$paf_est_lower,
paf_est_upper = pafs[pafs$apoe4_doses=="E4/E2",]$paf_est_upper + pafs[pafs$apoe4_doses=="E4/E4",]$paf_est_upper + (upperratio / (upperratio+1) ) * pafs[pafs$apoe4_doses=="E4/E3",]$paf_est_upper
)
# e3 calc
e3 = data.frame(
apoe4_doses = "Combined E3",
paf_est = pafs[pafs$apoe4_doses=="E3/E2",]$paf_est + pafs[pafs$apoe4_doses=="E3/E3",]$paf_est + ( 1 - (ratio / (ratio+1) ) )* pafs[pafs$apoe4_doses=="E4/E3",]$paf_est,
paf_est_lower = pafs[pafs$apoe4_doses=="E3/E2",]$paf_est_lower + pafs[pafs$apoe4_doses=="E3/E3",]$paf_est_lower + (1 - (lowerratio / (lowerratio+1) ) ) * pafs[pafs$apoe4_doses=="E4/E3",]$paf_est_lower,
paf_est_upper = pafs[pafs$apoe4_doses=="E3/E2",]$paf_est_upper + pafs[pafs$apoe4_doses=="E3/E3",]$paf_est_upper + (1 - (upperratio / (upperratio+1) ) )* pafs[pafs$apoe4_doses=="E4/E3",]$paf_est_upper )
data.frame(apoe4_doses = "E3 & E4")
bind_rows(e4,e3,pafs)
# crude
pafs = pafs %>% dplyr::select(-1) %>% colSums() %>%
t() %>%
data.frame() %>%
mutate(apoe4_doses = "Combined E3 & E4") %>%
bind_rows(pafs,e3,e4)
pafs$apoe4_doses = factor(pafs$apoe4_doses,
levels = c("Combined E3 & E4","Combined E3","Combined E4","E3/E2","E3/E3","E4/E2","E4/E3","E4/E4"
))
pafs = pafs %>% mutate(col = ifelse(grepl("Combined",apoe4_doses),"Combined","Other"))
ggplot(pafs,aes(apoe4_doses,100*paf_est,col=col,label=paste0(round(100*paf_est,1),"%")))+
geom_point(size=3)+
scale_color_brewer(palette="Dark2")+
geom_errorbar(mapping = aes(x = apoe4_doses,ymin=100*paf_est,ymax=100*paf_est_upper),width=0.01)+
theme_bw()+
theme(legend.position = "none")+
labs(y="PAF (%)",x="APOE genotype")+
ggrepel::geom_text_repel(nudge_x = 0.1)
# repeat with E2/E2 and E2/E3 combined reference
apoe_haps = apoe_haps %>% mutate(apoe_allele_for_paf = ifelse(
apoe4_doses %in% c("E2/E2","E3/E2"), "reference",as.character(apoe4_doses)
))
case_counts = apoe_haps %>%
group_by(apoe_allele_for_paf) %>%
dplyr::count(dementia_status) %>%
filter(dementia_status == "Case") %>%
ungroup() %>%
mutate(case_fraction = n/sum(n))
geno_prev = apoe_haps %>%
dplyr::count(apoe_allele_for_paf) %>%
mutate(prev = n/sum(n))
apoe_haps$apoe_allele_for_paf = relevel(factor(apoe_haps$apoe_allele_for_paf),ref="reference")
pafs = glm(data = apoe_haps %>%
mutate(dementia_status = ifelse(dementia_status == "Case",1,0)),
dementia_status ~ S1QST_Gender + AgeAtRecruitment + pop.GSA.51k +
PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
apoe_allele_for_paf,
family=binomial(link="logit")) %>%
broom::tidy() %>%
filter(grepl("apoe",term) | term=="reference") %>%
mutate(apoe_allele_for_paf = str_remove_all(term,"apoe_allele_for_paf")) %>%
mutate(or = exp(estimate)) %>%
mutate(lower = exp(estimate - 1.96*std.error)) %>%
mutate(upper = exp(estimate + 1.96*std.error)) %>%
left_join(case_counts,by="apoe_allele_for_paf") %>%
left_join(geno_prev %>% mutate(total = n),by="apoe_allele_for_paf") %>%
mutate(paf_est = (or-1)/or*case_fraction) %>%
mutate(paf_est_lower = (lower-1)/lower*case_fraction) %>%
mutate(paf_est_upper = (upper-1)/upper*case_fraction) %>%
dplyr::select(apoe_allele_for_paf,contains("paf"))
pafs = pafs %>% bind_rows(
pafs %>% dplyr::select(-1) %>% colSums())
pafs$apoe_allele_for_paf[5]="Combined"
ggplot(pafs,aes(apoe_allele_for_paf,
100*paf_est,
col=apoe_allele_for_paf!="Combined",
label = paste0(round(paf_est,3)*100,"%")))+
geom_point(show.legend = F)+
geom_errorbar(mapping = aes(x = apoe_allele_for_paf,ymin=100*paf_est_lower,ymax=100*paf_est_upper),width=0.01,show.legend = F)+
theme_bw()+
labs(y="PAF (%)",x="APOE genotype")+
scale_color_brewer(palette="Set1")+
theme(panel.grid = element_blank())+
ggrepel::geom_text_repel(show.legend = F)
pafs
# repeat for just E4
# survival curves
km_fit = survfit(
Surv(survtime,status) ~ apoe4_alleles,
data = apoe_haps
)
p0 = ggsurvplot(km_fit,
fun = "pct",
palette = "Set1",conf.int = F,pval = T,
ylab = "Probability of dementia-free survival",
xlab="Age (years)",
legend.labs = c("APOE E4 -/-","APOE E4 +/-","APOE E4 +/+"),
legend=c(0.3,0.7)
)
p0 = p0$plot+
scale_x_continuous(labels = function(x) x + 16,breaks = c(12,22,32,42,52,62,72))+
geom_hline(yintercept=50,linetype="dashed",alpha=0.3)
p0
# repeat for other alleles
# survival curves
km_fit = survfit(
Surv(survtime,status) ~ apoe4_doses,
data = apoe_haps
)
p99 = ggsurvplot(km_fit,
fun = "pct",
palette = "Set1",conf.int = F,pval = T,
ylab = "Probability of dementia-free survival",
xlab="Age (years)",
legend=c(0.3,0.7)
)
p99 = p99$plot+
scale_x_continuous(labels = function(x) x + 16,breaks = c(12,22,32,42,52,62,72))+
geom_hline(yintercept=50,linetype="dashed",alpha=0.3)
coxph(
Surv(survtime,status) ~ apoe4_alleles,
data = apoe_haps
) %>% summary()
# model
# set ref
apoe_haps$apoe4_doses = relevel(factor(apoe_haps$apoe4_doses),ref="E3/E3")
model = coxph(
Surv(survtime,status) ~ S1QST_Gender + AgeAtRecruitment + pop.GSA.51k +
PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
apoe4_doses,
data = apoe_haps %>% filter(pop.GSA.51k != "Ambiguous")
) %>% broom::tidy()
model$std.error
plot_dat = model %>% filter(grepl("apoe",term)) %>%
mutate(term = str_remove_all(term,"apoe4_doses"))
plot_dat$p_simple = sapply(plot_dat$p.value,function(x){
if(x<0.0001){
"<0.0001"
} else if(x<0.001){
"<0.001"
} else if(x<0.01){
"<0.01"
} else {
">0.01"
}
})
plot_dat$label = paste0("HR = ",
round(exp(plot_dat$estimate),1),"; P",plot_dat$p_simple)
plot_dat$lower = exp(plot_dat$estimate-1.96*plot_dat$std.error)
plot_dat$upper = exp(plot_dat$estimate+1.96*plot_dat$std.error)
plot_dat
p1 = ggplot(plot_dat,
aes(exp(estimate),term,label=label))+
geom_point()+
geom_errorbarh(mapping = aes(
xmin = exp(estimate - 1.96*std.error),
xmax = exp(estimate + 1.96*std.error)),
height=0.01,alpha=0.4
)+
theme_bw()+
labs(x="Hazard Ratio for dementia\nrelative to APOE 3/3 genotype",y="APOE genotype")+
geom_vline(xintercept=1,linetype="dashed",alpha=0.3)+
ggrepel::geom_text_repel(nudge_y=0.1)+
scale_x_log10()
png("/home/ivm/apoe/export/survival_plots.png",res=900,units="in",width=8,height=5)
cowplot::plot_grid(p0,p1)
dev.off()
# repeat with controls >60
controls_over_60_dat = apoe_haps %>% filter(AgeAtRecruitment>60 & (is.na(age_at_event) | age_at_event>60 ) )
controls_over_60_dat%>%
dplyr::count(dementia_status)
controls_over_60_dat%>%
group_by(dementia_status) %>%
summarise_at(.vars = vars(AgeAtRecruitment,age_at_event),.funs = c(median,IQR),na.rm=T)
controls_over_60_dat%>%
group_by(dementia_status) %>%
dplyr::count(S1QST_Gender) %>%
mutate(prop = n/sum(n))
controls_over_60_dat%>%
group_by(dementia_status) %>%
dplyr::count(apoe4_doses) %>%
mutate(prop = n/sum(n)) %>%
pivot_wider(values_from = prop,names_from = apoe4_doses,id_cols = dementia_status)
controls_over_60_dat = controls_over_60_dat %>%
mutate(age_at_data_extract = as.numeric(as.Date("2024-01-01",format="%Y-%m-%d") - s1qst_dob)/365.25) %>%
mutate(status = ifelse(dementia_status == "Control",0,1)) %>%
mutate(survtime = ifelse(dementia_status == "Control",age_at_data_extract-60,age_at_event-60))
model = coxph(
Surv(survtime,status) ~ S1QST_Gender + AgeAtRecruitment + pop.GSA.51k +
PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
apoe4_doses,
data = controls_over_60_dat %>% filter(pop.GSA.51k != "Ambiguous")
) %>% broom::tidy()
plot_dat = model %>% filter(grepl("apoe",term)) %>%
mutate(term = str_remove_all(term,"apoe4_doses"))
plot_dat$p_simple = sapply(plot_dat$p.value,function(x){
if(x<0.0001){
"<0.0001"
} else if(x<0.001){
"<0.001"
} else if(x<0.01){
"<0.01"
} else {
">0.01"
}
})
plot_dat$label = paste0("HR = ",
round(exp(plot_dat$estimate),1),"; P",plot_dat$p_simple)
plot_dat$lower = exp(plot_dat$estimate-1.96*plot_dat$std.error)
plot_dat$upper = exp(plot_dat$estimate+1.96*plot_dat$std.error)
plot_dat %>% dplyr::select(-2,-3,-4)
# just e4
coxph(
Surv(survtime,status) ~ S1QST_Gender+AgeAtRecruitment + pop.GSA.51k +
PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
apoe4_alleles,
data = apoe_haps %>% filter(pop.GSA.51k != "Ambiguous")
) %>% broom::tidy() %>%
filter(grepl("apoe4_alleles",term)) %>%
mutate(HR = exp(estimate)) %>%
mutate(lowerci = exp(estimate - 1.96* std.error)) %>%
mutate(upperci = exp(estimate + 1.96* std.error))
# just e4
coxph(
Surv(survtime,status) ~ S1QST_Gender+AgeAtRecruitment + pop.GSA.51k +
PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
apoe4_status,
data = apoe_haps %>% mutate(apoe4_status = ifelse(apoe4_alleles >0,"yes","no")) %>%
filter(pop.GSA.51k != "Ambiguous")) %>% broom::tidy() %>%
filter(grepl("apoe4_status",term)) %>%
mutate(HR = exp(estimate)) %>%
mutate(lowerci = exp(estimate - 1.96* std.error)) %>%
mutate(upperci = exp(estimate + 1.96* std.error))
# gender-stratified
apoe_haps$gender = ifelse(apoe_haps$S1QST_Gender==1,"M","F")
km_fit = survfit(
Surv(survtime,status) ~ apoe4_alleles,
data = apoe_haps
)
p0 = ggsurvplot_facet(km_fit,data=apoe_haps,
facet.by = "gender",
legend.labs = c("APOE E4 -/-","APOE E4 +/-","APOE E4 +/+"),
fun = "pct",
ylab = "Probability of dementia-free survival",
xlab="Age (years)",
palette = "Set1",
legend=c(0.1,0.7),
conf.int = F
) +
scale_x_continuous(labels = function(x) x + 16,breaks = c(12,22,32,42,52,62,72))+
geom_hline(yintercept=50,linetype="dashed",alpha=0.3)
p0
coxph(
Surv(survtime,status) ~ +AgeAtRecruitment + pop.GSA.51k +
PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
apoe4_doses,
data = apoe_haps %>% filter(pop.GSA.51k != "Ambiguous" & S1QST_Gender == "1")
) %>% broom::tidy() %>%
filter(grepl("apoe4_doses",term)) %>%
mutate(HR = exp(estimate)) %>%
mutate(lowerci = exp(estimate - 1.96* std.error)) %>%
mutate(upperci = exp(estimate + 1.96* std.error))
# repeat with AD dementia
apoe_haps$apoe4_doses = relevel(factor(apoe_haps$apoe4_doses),ref="E3/E3")
coxph(
Surv(survtime,status) ~ S1QST_Gender+AgeAtRecruitment + pop.GSA.51k +
PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
apoe4_doses,
data = apoe_haps %>% filter(pop.GSA.51k != "Ambiguous")
) %>% broom::tidy() %>%
filter(grepl("apoe4_doses",term)) %>%
mutate(HR = exp(estimate)) %>%
mutate(lowerci = exp(estimate - 1.96* std.error)) %>%
mutate(upperci = exp(estimate + 1.96* std.error))
coxph(
Surv(survtime,status) ~ +AgeAtRecruitment + pop.GSA.51k +
PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
apoe4_doses,
data = apoe_haps %>% filter(pop.GSA.51k != "Ambiguous" & S1QST_Gender == "2")
) %>% broom::tidy() %>%
filter(grepl("apoe4_doses",term)) %>%
mutate(HR = exp(estimate)) %>% mutate(lowerci = exp(estimate - 1.96* std.error)) %>%
mutate(upperci = exp(estimate + 1.96* std.error))
# interaction
coxph(
Surv(survtime,status) ~ +AgeAtRecruitment + pop.GSA.51k +
PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
apoe4_alleles * S1QST_Gender,
data = apoe_haps %>% filter(pop.GSA.51k != "Ambiguous")
) %>% broom::tidy() %>%
filter(grepl("apoe4_alleles",term)) %>%
mutate(HR = exp(estimate)) %>%
mutate(lowerci = exp(estimate - 1.96* std.error)) %>%
mutate(upperci = exp(estimate + 1.96* std.error))
ggplot(apoe_haps,aes(AgeAtRecruitment,age_at_event))+geom_point()+
geom_abline(intercept=0,slope=1)
hist(apoe_haps$AgeAtRecruitment)
# repeat for ad
apoe_haps = apoe_haps %>%
mutate(age_at_data_extract = as.numeric(as.Date("2024-01-01",format="%Y-%m-%d") - s1qst_dob)/365.25) %>%
mutate(status = ifelse(ad_status == "Control",0,1)) %>%
mutate(survtime = ifelse(ad_status == "Control",age_at_data_extract-16,age_at_event_ad-16))
model_ad = coxph(
Surv(survtime,status) ~ S1QST_Gender + AgeAtRecruitment + pop.GSA.51k +
PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
apoe4_doses,
data = apoe_haps %>% filter(pop.GSA.51k != "Ambiguous")
) %>% broom::tidy() %>%
filter(grepl("apoe4",term)) %>%
mutate(outcome = "AD")
model_ad %>% mutate(HR = exp(estimate)) %>%
mutate(lowerci = exp(estimate - 1.96* std.error)) %>%
mutate(upperci = exp(estimate + 1.96* std.error))
apoe_haps$ad_status %>% table
km_fit = survfit(
Surv(survtime,status) ~ apoe4_alleles,
data = apoe_haps
)
p0 = ggsurvplot(km_fit,
fun = "pct",
palette = "Set1",conf.int = F,pval = T,
ylab = "Probability of dementia-free survival",
xlab="Age (years)",
legend.labs = c("APOE E4 -/-","APOE E4 +/-","APOE E4 +/+"),
legend=c(0.3,0.7)
)
p0 = p0$plot+
scale_x_continuous(labels = function(x) x + 16,breaks = c(12,22,32,42,52,62,72))+
geom_hline(yintercept=50,linetype="dashed",alpha=0.3)
# repeat for VaD
apoe_haps = apoe_haps %>%
mutate(age_at_data_extract = as.numeric(as.Date("2024-01-01",format="%Y-%m-%d") - s1qst_dob)/365.25) %>%
mutate(status = ifelse(vad_status == "Control",0,1)) %>%
mutate(survtime = ifelse(vad_status == "Control",age_at_data_extract-16,age_at_event_vad-16))
model_vad = coxph(
Surv(survtime,status) ~ S1QST_Gender + AgeAtRecruitment + pop.GSA.51k +
PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
apoe4_doses,
data = apoe_haps %>% filter(pop.GSA.51k != "Ambiguous")
) %>% broom::tidy() %>%
filter(grepl("apoe4",term)) %>%
mutate(outcome = "VaD")
# custom binary trait phewas
res = list()
pheno_list = list.files("/genesandhealth/library-red/genesandhealth/phenotypes_curated/version010_2025_04/BI_PY/outputs/custom_phenotypes/individual_trait_files/",pattern = "csv")
for(i in c(1:length(pheno_list))){
phenotype = pheno_list[i]
# read in
pheno_dat = read_csv(paste0("/genesandhealth/library-red/genesandhealth/phenotypes_curated/version010_2025_04/BI_PY/outputs/custom_phenotypes/individual_trait_files/",phenotype))
if(nrow(pheno_dat)>500){
# get short name
pheno_name = pheno_dat$phenotype[1]
# take earliest
pheno_dat = pheno_dat %>%
group_by(nhs_number) %>% slice_min(date,n=1,with_ties = F) %>% ungroup()
# join with oragene
pheno_dat = pheno_dat %>%
inner_join(link_file,by="nhs_number")
# join apoe data with phenotype
pheno_dat = apoe_haps %>%
mutate(IID = as.numeric(IID)) %>%
dplyr::select(-phenotype,-age_at_event) %>%
left_join(pheno_dat,by="IID") %>%
mutate(pheno_status = ifelse(is.na(age_at_event),"Control","Case"))
# exclude ambiguous ancestry
pheno_dat = pheno_dat %>% filter(pop.GSA.51k != "Ambiguous")
# set ref
pheno_dat$apoe4_doses = relevel(factor(pheno_dat$apoe4_doses),ref="E3/E3")
model = glm(data = pheno_dat %>%
mutate(pheno_status = ifelse(pheno_status == "Case",1,0)),
pheno_status ~ S1QST_Gender + AgeAtRecruitment + pop.GSA.51k +
PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
apoe4_doses,
family=binomial(link="logit")) %>%
broom::tidy() %>%
filter(grepl("apoe",term)) %>%
mutate(phenotype = pheno_name)
res[[i]] = model
}
}
# combine res
res = do.call("bind_rows",res)
# repeat for quant traits
res_quant = list()
pheno_list_quant = list.files("/genesandhealth/library-red/genesandhealth/phenotypes_curated/version010_2025_04/QUANT_PY/outputs/individual_trait_files/all/",pattern = "per_individual")
for(i in c(1:length(pheno_list_quant))){
phenotype = pheno_list_quant[i]
# read in
pheno_dat = read_csv(paste0("/genesandhealth/library-red/genesandhealth/phenotypes_curated/version010_2025_04/QUANT_PY/outputs/individual_trait_files/all/",
phenotype)
)
if(nrow(pheno_dat)>5000){
# get short name
pheno_name = pheno_dat$trait[1]
# join with oragene
pheno_dat = pheno_dat %>%
dplyr::select(pseudo_nhs_number,median) %>%
dplyr::rename("nhs_number" = pseudo_nhs_number) %>%
inner_join(link_file,by="nhs_number")
# join apoe data with phenotype
pheno_dat = apoe_haps %>%
mutate(IID = as.numeric(IID)) %>%
left_join(pheno_dat,by="IID")
# exclude ambiguous ancestry
pheno_dat = pheno_dat %>% filter(pop.GSA.51k != "Ambiguous")
# set ref
pheno_dat$apoe4_doses = relevel(factor(pheno_dat$apoe4_doses),ref="E3/E3")
# rank norm
model_dat = pheno_dat %>% filter(!is.na(median))
model_dat$pheno_norm = RNOmni::RankNorm(model_dat$median)
model = lm(data = model_dat,
pheno_norm ~ S1QST_Gender + AgeAtRecruitment + pop.GSA.51k +
PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
apoe4_doses) %>%
broom::tidy() %>%
filter(grepl("apoe",term)) %>%
mutate(phenotype = pheno_name)
res_quant[[i]] = model
}
}
# combine binary & quant
# arrange by p
res_quant =do.call("bind_rows",res_quant)
# combine
res = bind_rows(res,res_quant)
res = res %>% arrange(p.value)
# bonf
res$phenotype %>% unique %>% length
# clean
res$term = str_remove_all(res$term,"apoe4_doses")
# plot
res$fdr = p.adjust(res$p.value,method="fdr")
res$bonf = p.adjust(res$p.value,method="bonf")
res$direction = ifelse(res$estimate<0,"decreases","increases")
# plot
res$phenotype = str_replace_all(res$phenotype,"_"," ")
res$phenotype = factor(res$phenotype,ordered=T,levels = unique(res$phenotype))
png("/home/ivm/apoe/export/phewas.png",res=900,units="in",width=8,height=5)
ggplot(res %>% filter(fdr<0.05),aes(phenotype,-log10(p.value),col=term,label=phenotype,shape = direction))+
geom_point(size=3)+
theme_bw()+
scale_color_brewer(palette="Set1")+
labs(x="Phenotype",y=bquote(-log[10]~P),shape="Effect direction",color="Genotype")+
coord_flip()+
geom_hline(yintercept=-log10(0.05/236),alpha=0.1,linetype="dashed")
dev.off()
res$phenotype %>% unique()
ggplot(plot_dat,
aes(term,phenotype,fill=direction,alpha = -log10(p.value)/max(-log10(sighits$p.value)),label = bonf_simple))+
geom_tile(color="black")+
theme_bw()+
scale_fill_manual(values = c("blue","red"))+
geom_text(alpha=1,size=5)+
guides(alpha="none")+
labs(fill="Direction of effect\n(relative to E3/E3)",x="APOE genotype",y="Phenotype")+
theme(panel.grid = element_blank())
res %>% filter(fdr<0.05)
ggplot(res,aes(phenotype,-log10(p.value),fill=term,label = paste0(term," ",direction," ",phenotype)))+
geom_point(data= res %>% filter(fdr < 0.05),shape=21)+
theme_bw()+
theme(axis.text.x = element_blank(),axis.line.x = element_blank(),axis.ticks.x = element_blank())+
ggrepel::geom_text_repel(data = res %>% filter(bonf<0.05),min.segment.length = 0,size=3,nudge_x = 20)