Preamble

This markdown doc contains code used to analyse the association of APOE alleles with dementia in Genes & Health. Ben Jacobs 14-08-25

Extract haplotypes from imputed genotype data

cd /home/ivm/apoe/scratch
plink2 --vcf /genesandhealth/library-red/genesandhealth/GSAv3EAMD/Oct2023_51k_TOPMED-r2_Imputation_b38/unfiltered_merged_vcfs/chr19.dose.merged.vcf.gz \
--chr 19 \
--from-bp 44908684 \
--to-bp 44908822 \
--make-pfile \
--out apoe_region

cat /home/ivm/apoe/scratch/apoe_region.pvar | grep -v -E "##"

echo "chr19:44908684:T:C" > snps_to_keep
echo "chr19:44908822:C:T" >> snps_to_keep

plink2 --pfile apoe_region \
--extract snps_to_keep \
--export haps \
--out apoe_haplotypes

Clean haplotypes

library(tidyverse)

# read in haps 
apoe_haps = read_table("/home/ivm/apoe/scratch/apoe_haplotypes.haps",col_names = F)
apoe_samples = read_table("/home/ivm//apoe/scratch/apoe_haplotypes.sample",col_names = T) %>%
  filter(ID_2 != 0)
colnames(apoe_haps)[c(1:5)] = c("CHR","SNP","POS","REF","ALT")
colnames = list()
for(i in c(1:length(apoe_samples$ID_2))){
  colnames[[length(colnames)+1]] = paste0(apoe_samples$ID_2[i],"_mat")
  colnames[[length(colnames)+1]] = paste0(apoe_samples$ID_2[i],"_pat")
  
}
colnames(apoe_haps)[c(6:ncol(apoe_haps))] = unlist(colnames)

# spread long
apoe_haps = apoe_haps %>% 
  pivot_longer(cols = c(6:ncol(apoe_haps))) %>%
  pivot_wider(id_cols = name, values_from = value, names_from = SNP) 

apoe_haps %>%
  dplyr::count(`chr19:44908684:T:C`) %>%
  mutate(prop = n/sum(n))

# define haplotypes
apoe_haps = apoe_haps %>%
  mutate(apoe_status = case_when(
    `chr19:44908684:T:C` == 0 & `chr19:44908822:C:T` == 1 ~ "APOE4",
    `chr19:44908684:T:C` == 1 & `chr19:44908822:C:T` == 1 ~ "APOE3",
    `chr19:44908684:T:C` == 1 & `chr19:44908822:C:T` == 0 ~ "APOE2",
    `chr19:44908684:T:C` == 0 & `chr19:44908822:C:T` == 0 ~ "APOE1"
  )) %>%
  separate(name,sep = "_", into = c("IID","FID","other","chrom")) %>%
  dplyr::select(-FID,-other) %>%
  pivot_wider(id_cols = IID,values_from = apoe_status,names_from = chrom) %>%
  mutate(apoe4_doses = 
           case_when(mat == "APOE4" & pat == "APOE4" ~ "E4/E4",
                     mat == "APOE3" & pat == "APOE3" ~ "E3/E3",
                     mat == "APOE2" & pat == "APOE2" ~ "E2/E2",
                     mat == "APOE1" & pat == "APOE1" ~ "E1/E1",
                     mat == "APOE4" & pat == "APOE3" ~ "E4/E3",
                     mat == "APOE3" & pat == "APOE4" ~ "E4/E3",
                     mat == "APOE4" & pat == "APOE2" ~ "E4/E2",
                     mat == "APOE2" & pat == "APOE4" ~ "E4/E2",
                     mat == "APOE4" & pat == "APOE1" ~ "E4/E1",
                     mat == "APOE1" & pat == "APOE4" ~ "E4/E1",
                     mat == "APOE3" & pat == "APOE2" ~ "E3/E2",
                     mat == "APOE2" & pat == "APOE3" ~ "E3/E2",
                     mat == "APOE3" & pat == "APOE1" ~ "E3/E1",
                     mat == "APOE1" & pat == "APOE3" ~ "E3/E1",
                     mat == "APOE2" & pat == "APOE1" ~ "E2/E1",
                     mat == "APOE1" & pat == "APOE2" ~ "E2/E1"
           ))


# get just e4 dose
apoe_haps = apoe_haps %>%
  mutate(apoe4_alleles = case_when(
    apoe4_doses == "E4/E4" ~ 2,
    apoe4_doses == "E4/E2" ~ 1,
    apoe4_doses == "E4/E3" ~ 1,
    apoe4_doses == "E4/E1" ~ 1,
    .default = 0))

apoe_haps = apoe_haps %>%
  mutate(apoe3_alleles = case_when(
    mat == "APOE3" & pat == "APOE3" ~ 2,
    mat == "APOE3" & pat != "APOE3" ~ 1,
    mat != "APOE3" & pat == "APOE3" ~ 1,
    mat != "APOE3" & pat != "APOE3" ~ 0,
    .default = 0))

apoe_haps = apoe_haps %>%
  mutate(apoe2_alleles = case_when(
    mat == "APOE2" & pat == "APOE2" ~ 2,
    mat == "APOE2" & pat != "APOE2" ~ 1,
    mat != "APOE2" & pat == "APOE2" ~ 1,
    mat != "APOE2" & pat != "APOE2" ~ 0,
    .default = 0))

Allele frequency

# binomial tests 
binom.test(x = 10016,n = (50490+614),p = 0.288,alternative = "less")
binom.test(x = 49280,n = 50490,p = 0.944)

# AF heatmap 

props = apoe_haps %>% 
  dplyr::count(apoe4_doses) %>% 
  mutate(pct_gh = 100*n/sum(n)) %>% 
  dplyr::select(-n) 

props_ukb = data.frame(
  apoe4_doses = c("E2/E2", "E3/E2", "E4/E2", "E3/E3", "E4/E3", "E4/E4"),
  pct_ukb = c(0.6, 12.3, 2.6, 58.2, 23.9, 2.4)
)

props = props %>% left_join(props_ukb,by="apoe4_doses") 
colnames(props) = c("genotype","Genes & Health (SAS)","UK Biobank (EUR)")
props = props %>%
  pivot_longer(cols = c(2,3))

props = props %>% arrange(genotype)
props$genotype = factor(props$genotype,levels = unique(props$genotype)) 
props = props %>% group_by(name) %>% mutate(cumsum = cumsum(value)) %>% 
  mutate(pos = 100-cumsum)
props$pos2 = c(100,100,props$pos[-c(11,12)])
props = props %>% 
  ungroup %>% 
  mutate(midpoint = (pos + pos2 ) / 2)
png("/home/ivm/apoe/export/afs.png",res=900,units="in",width=6,height=6)
ggplot(props,aes(name,value,fill=genotype))+
  geom_col(color="black")+
  scale_fill_brewer(palette="Set1")+
  theme_bw()+
  labs(fill="APOE allele",x="Cohort",y="Percentage of cohort (%)")+
  geom_text(data = props %>% mutate(label = ifelse(value < 5," ",paste0(round(value,1),"%"))),
            mapping = aes(label = label,y=midpoint))
dev.off()

Add additional phenotypes & covariates

# get phenotype data 
dementia = read_csv("/genesandhealth/library-red/genesandhealth/phenotypes_curated/version010_2025_04/BI_PY/outputs/custom_phenotypes/individual_trait_files/2025_05_Dementia_summary_report.csv")
ad = read_csv("/genesandhealth/library-red/genesandhealth/phenotypes_curated/version010_2025_04/BI_PY/outputs/icd10/individual_trait_files/3_digit_icd/2025_05_G30_summary_report.csv")
dementia_nos = read_csv("/genesandhealth/library-red/genesandhealth/phenotypes_curated/version010_2025_04/BI_PY/outputs/icd10/individual_trait_files/3_digit_icd/2025_05_F03_summary_report.csv")
vad = read_csv("/genesandhealth/library-red/genesandhealth/phenotypes_curated/version010_2025_04/BI_PY/outputs/icd10/individual_trait_files/3_digit_icd/2025_05_F01_summary_report.csv")

# combine all 
bind_rows(dementia,ad,dementia_nos,vad) %>% distinct(nhs_number)

link_file = read_csv("/genesandhealth/library-red/genesandhealth/2025_02_10__MegaLinkage_forTRE.csv") %>%
  dplyr::rename("IID" = OrageneID,"nhs_number"=`pseudonhs_2024-07-10`) %>%
  dplyr::select(IID,nhs_number)
dob = read_csv("/genesandhealth/library-red/genesandhealth/phenotypes_rawdata/QMUL__Stage1Questionnaire/2025_04_25__S1QSTredacted.csv") %>% 
  mutate(s1qst_dob = as.Date(paste0("01-",`S1QST_MM-YYYY_ofBirth`),format="%d-%m-%Y")) %>% 
  mutate(IID = as.character(S1QST_Oragene_ID)) %>%
  dplyr::select(IID,s1qst_dob)

dementia = dementia %>%
  inner_join(link_file,by="nhs_number")

# take earliest date 
dementia = dementia %>% group_by(nhs_number) %>% slice_min(date,n=1,with_ties = F) %>% ungroup()
  
# join haplotypes with dementia status
apoe_haps = apoe_haps %>%
  mutate(IID = as.numeric(IID)) %>%
  left_join(dementia,by="IID") %>%
  mutate(dementia_status = ifelse(is.na(age_at_event),"Control","Case"))

# join with dob 
apoe_haps = apoe_haps %>% 
  inner_join(dob %>% mutate(IID = as.numeric(IID)),by="IID")  

# add covars
cov = read_tsv("/genesandhealth/library-red/genesandhealth/GSAv3EAMD/Oct2023_51k_TOPMED-r2_Imputation_b38/GNH.51170.noEthnicOutliers.covariates.20PCs.tab") 
apoe_haps = apoe_haps %>% 
  mutate(IID = as.character(IID)) %>% 
  inner_join(cov %>%
               mutate(IID = as.character(OrageneID)),by="IID")  

# add other phenotypes 
## vad
apoe_haps = apoe_haps %>%
  left_join(vad  %>%
              inner_join(link_file,by="nhs_number") %>% 
              dplyr::rename("age_at_event_vad" = age_at_event) %>% 
              mutate(IID = as.character(IID)),by="IID") %>%
  mutate(vad_status = ifelse(is.na(age_at_event_vad),"Control","Case"))

## AD
apoe_haps = apoe_haps %>%
  left_join(ad  %>%
              inner_join(link_file,by="nhs_number") %>% 
              dplyr::rename("age_at_event_ad" = age_at_event) %>% 
              mutate(IID = as.character(IID)),by="IID") %>%
  mutate(ad_status = ifelse(is.na(age_at_event_ad),"Control","Case"))

## NOS
apoe_haps = apoe_haps %>%
  left_join(dementia_nos  %>%
              inner_join(link_file,by="nhs_number") %>% 
              dplyr::rename("age_at_event_nos" = age_at_event) %>% 
              mutate(IID = as.character(IID)),by="IID") %>%
  mutate(dementia_nos__status = ifelse(is.na(age_at_event_nos),"Control","Case"))


apoe_haps %>% glimpse()

Demographics

# survival analysis 
library(survival)
library(survminer)

# define event
apoe_haps = apoe_haps %>% 
  mutate(age_at_data_extract = as.numeric(as.Date("2024-01-01",format="%Y-%m-%d") - s1qst_dob)/365.25) %>%
  mutate(status = ifelse(dementia_status == "Control",0,1)) %>% 
  mutate(survtime = ifelse(dementia_status == "Control",age_at_data_extract-16,age_at_event-16))  

# demographics 
apoe_haps
hist(apoe_haps$AgeAtRecruitment)
summary(apoe_haps$AgeAtRecruitment)

# exclude ambiguous 
apoe_haps = apoe_haps %>% filter(pop.GSA.51k!="Ambiguous")
nrow(apoe_haps)
apoe_haps %>% nrow()
apoe_haps %>% filter(AgeAtRecruitment<18)
apoe_haps %>% dplyr::count(dementia_status) 
apoe_haps %>% dplyr::count(apoe4_alleles) %>% mutate(prop = n/sum(n))
apoe_haps %>% dplyr::count(apoe3_alleles) %>% mutate(prop = n/sum(n))

# get AFs
nchroms = nrow(apoe_haps)*2
apoe_haps %>% dplyr::count(apoe2_alleles) %>% mutate(n_dose = n*apoe2_alleles) %>% mutate(af = sum(n_dose)/ nchroms) 
apoe_haps %>% dplyr::count(apoe3_alleles) %>% mutate(n_dose = n*apoe3_alleles) %>% mutate(af = sum(n_dose)/ nchroms) 
apoe_haps %>% dplyr::count(apoe4_alleles) %>% mutate(n_dose = n*apoe4_alleles) %>% mutate(af = sum(n_dose)/nchroms) 

# stratified 
apoe_haps %>% group_by(dementia_status) %>% dplyr::count(apoe2_alleles) %>% mutate(n_dose = n*apoe2_alleles) %>% mutate(af = sum(n_dose)/ (2*sum(n))) 
apoe_haps %>% group_by(dementia_status) %>% dplyr::count(apoe3_alleles) %>% mutate(n_dose = n*apoe3_alleles) %>% mutate(af = sum(n_dose)/ (2*sum(n))) 
apoe_haps %>% group_by(dementia_status) %>% dplyr::count(apoe4_alleles) %>% mutate(n_dose = n*apoe4_alleles) %>% mutate(af = sum(n_dose)/ (2*sum(n))) 

# demographics
apoe_haps %>% summarise_at(vars(AgeAtRecruitment),.funs = c("median","IQR"))

apoe_haps %>% group_by(dementia_status) %>% summarise_at(vars(AgeAtRecruitment),.funs = c("median","IQR"))
apoe_haps %>%  group_by(dementia_status) %>% dplyr::count(S1QST_Gender) %>% mutate(pct = n/sum(n)*100)
apoe_haps  %>% dplyr::count(S1QST_Gender) %>% mutate(pct = n/sum(n)*100)

apoe_haps %>% dplyr::count(apoe4_doses) %>% mutate(pct = n/sum(n)*100) %>% 
  mutate(lab = paste0(n," ",round(pct,1),"%"))


apoe_haps %>% group_by(dementia_status) %>% dplyr::count(apoe4_doses) %>% mutate(pct = n/sum(n)*100) %>% 
  mutate(lab = paste0(n," ",round(pct,1),"%")) %>%
  pivot_wider(id_cols = apoe4_doses,values_from = lab,names_from = dementia_status)
apoe_haps %>% group_by(dementia_status) %>% dplyr::count(apoe4_alleles) %>% mutate(pct = n/sum(n)*100) %>% 
  mutate(lab = paste0(n," ",round(pct,1),"%")) %>%
  pivot_wider(id_cols = apoe4_alleles,values_from = lab,names_from = dementia_status)

apoe_haps %>% group_by(dementia_status) %>% dplyr::count(pop.GSA.51k) %>% mutate(pct = n/sum(n)*100) %>% arrange(pct)
apoe_haps %>% filter(dementia_status=="Case") %>% summarise_at(vars(age_at_event),.funs = c("median","IQR"),na.rm=T)

apoe_haps %>% group_by(pop.GSA.51k) %>% 
  dplyr::count(apoe4_alleles) %>% 
  mutate(n_alleles = apoe4_alleles*n) %>%
  summarise(total_alleles = sum(n_alleles),total_chroms = 2*sum(n)) %>% 
  mutate(af = total_alleles / total_chroms)

cases = apoe_haps %>% filter(dementia_status=="Case") %>% mutate(case_type = ifelse(age_at_event > AgeAtRecruitment,"Incident","Prevalent"))
cases %>% 
  dplyr::count(case_type) %>%
  mutate(total = sum(n),prop = n/total)

Models, survival, PAF

# unadjusted
glm(data = apoe_haps %>%
      mutate(dementia_status = ifelse(dementia_status == "Case",1,0)),
    dementia_status ~ apoe4_doses,
    family=binomial(link="logit")) %>% 
  summary()

# adjusted 

# set ref 
apoe_haps$apoe4_doses = relevel(factor(apoe_haps$apoe4_doses),ref="E3/E3")
model = glm(data = apoe_haps %>%
              mutate(dementia_status = ifelse(dementia_status == "Case",1,0)),
            dementia_status ~ S1QST_Gender + AgeAtRecruitment + pop.GSA.51k + 
              PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + 
              apoe4_doses,
            family=binomial(link="logit")) %>% 
  broom::tidy()
model
coefs = model %>% 
  filter(grepl("apoe4",term)) %>% 
  mutate(term = str_remove_all(term,"apoe4_doses")) %>% 
  bind_rows(
    data.frame(
      term = "E3/E3",estimate=0,std.error =0.,statistic=NA,p.value=1
    )
  ) %>% 
  mutate(or = exp(estimate)) %>% 
  mutate(lowerci = exp(estimate-1.96*std.error)) %>% 
  mutate(upperci = exp(estimate+1.96*std.error)) 
coefs

model = glm(data = apoe_haps %>%
              mutate(dementia_status = ifelse(dementia_status == "Case",1,0)),
            dementia_status ~ S1QST_Gender + AgeAtRecruitment + pop.GSA.51k + 
              PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + 
              apoe4_doses,
            family=binomial(link="logit")) %>% 
  broom::tidy()

# repeat using e2/e2 as ref
# set ref 
apoe_haps$apoe4_doses = relevel(factor(apoe_haps$apoe4_doses),ref="E2/E2")
model = glm(data = apoe_haps %>%
              mutate(dementia_status = ifelse(dementia_status == "Case",1,0)),
            dementia_status ~ S1QST_Gender + AgeAtRecruitment + pop.GSA.51k + 
              PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + 
              apoe4_doses,
            family=binomial(link="logit")) %>% 
  broom::tidy()
model
coefs = model %>% 
  filter(grepl("apoe4",term)) %>% 
  mutate(term = str_remove_all(term,"apoe4_doses")) %>% 
  bind_rows(
    data.frame(
      term = "E2/E2",estimate=0,std.error =0.,statistic=NA,p.value=1
    )
  ) %>% 
  mutate(or = exp(estimate)) %>% 
  mutate(lowerci = exp(estimate-1.96*std.error)) %>% 
  mutate(upperci = exp(estimate+1.96*std.error)) 
coefs
apoe_haps$apoe4_doses = relevel(factor(apoe_haps$apoe4_doses),ref="E3/E3")

# repeat for apoe4 alone 
glm(data = apoe_haps %>%
              mutate(dementia_status = ifelse(dementia_status == "Case",1,0)) %>% 
              mutate(apoe4_carrier = ifelse(apoe4_alleles>0,"yes","no")),
            dementia_status ~ S1QST_Gender + AgeAtRecruitment + pop.GSA.51k + 
              PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + 
           apoe4_carrier,
            family=binomial(link="logit")) %>% 
  broom::tidy() %>% 
  filter(term == "apoe4_carrieryes") %>% 
  mutate(or = exp(estimate))

glm(data = apoe_haps %>%
      mutate(dementia_status = ifelse(dementia_status == "Case",1,0)) %>% 
      mutate(apoe4_carrier = ifelse(apoe4_alleles>0,"yes","no")),
    dementia_status ~ S1QST_Gender + AgeAtRecruitment + pop.GSA.51k + 
      PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + 
      apoe4_carrier,
    family=binomial(link="log")) %>% 
  broom::tidy() %>% 
  filter(term == "apoe4_carrieryes") %>% 
  mutate(rr = exp(estimate))

# PAF for apoe4 specifically 

## prop exposed to apoe4
props = apoe_haps %>% dplyr::count(apoe4_alleles != 0) %>% mutate(prop = n/sum(n))
prop_exp = props[props$`apoe4_alleles != 0`==T,]$prop
props_by_case_status = apoe_haps %>% group_by(dementia_status) %>% 
  dplyr::count(apoe4_alleles != 0) %>% mutate(prop = n/sum(n)) %>% 
  filter(dementia_status=="Case")
prop_exp_cases = props_by_case_status[props_by_case_status$`apoe4_alleles != 0`==T,]$prop



## rr given apoe4 
tbl = apoe_haps %>% 
  group_by(apoe4_alleles != 0) %>% 
  dplyr::count(dementia_status)%>% 
  mutate(prop = n/sum(n)) %>%
  filter(dementia_status=="Case")
rr = tbl[tbl$`apoe4_alleles != 0`==T,]$prop / tbl[tbl$`apoe4_alleles != 0`==F,]$prop

## manual calc 
rr_manual = 1.44
prev_manual = 0.195562
paf_levin = ( prev_manual * (rr_manual - 1) ) / ( prev_manual * (rr_manual - 1) + 1)

## function to compute PAFs dichotomously

paf_calc_dichotomous = function(beta,se,prop_exposure,prop_exposure_cases){
  
  # Levin dichotomous
  rr = exp(beta)
  lower_rr = exp(beta-1.96*se)
  upper_rr = exp(beta+1.96*se)
  
  paf_levin = ( prop_exposure * (rr - 1) ) / ( prop_exposure * (rr - 1) + 1)
  paf_levin_lower = ( prop_exposure * (lower_rr - 1) ) / ( prop_exposure * (lower_rr - 1) + 1)
  paf_levin_upper = ( prop_exposure * (upper_rr - 1) ) / ( prop_exposure * (upper_rr - 1) + 1)

  # Miettenin dichotomous
  paf_miettenin = ( (rr-1) / rr) * prop_exposure_cases
  paf_miettenin_lower = ( (lower_rr-1) / lower_rr) * prop_exposure_cases
  paf_miettenin_upper = ( (upper_rr-1) / upper_rr) * prop_exposure_cases
  
  data.frame(
    "PAF" = c(paf_levin,paf_miettenin),
    "Lower_CI" = c(paf_levin_lower,paf_miettenin_lower),
    "Upper_CI" = c(paf_levin_upper,paf_miettenin_upper),
    "Method" = c("Levin","Miettenin") 
  )
  
}

res = paf_calc_dichotomous(0.48,0.101,prop_exp,prop_exp_cases) %>%
  mutate(exposure = "APOE4 (dominant)")

# Repeat for e3/e4

## prop exposed to apoe4

apoe_haps = apoe_haps %>% 
  mutate(apoe3_or_4_carrier = ifelse(mat == "APOE3" | pat =="APOE3" | mat == "APOE4" | pat =="APOE4","yes","no"))
props = apoe_haps %>% dplyr::count(apoe3_or_4_carrier) %>% mutate(prop = n/sum(n))
prop_exp = props[props$apoe3_or_4_carrier=="yes",]$prop
props_by_case_status = apoe_haps %>% group_by(dementia_status) %>% 
  dplyr::count(apoe3_or_4_carrier) %>% mutate(prop = n/sum(n)) %>% 
  filter(dementia_status=="Case")
prop_exp_cases = props_by_case_status[props_by_case_status$apoe3_or_4_carrier=="yes",]$prop

# repeat for apoe4 or apoe3 alone 
glm(data = apoe_haps %>%
      mutate(dementia_status = ifelse(dementia_status == "Case",1,0)),
    dementia_status ~ S1QST_Gender + AgeAtRecruitment + pop.GSA.51k + 
      PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + 
      apoe3_or_4_carrier,
    family=binomial(link="logit")) %>% 
  broom::tidy() %>% 
  filter(term == "apoe3_or_4_carrieryes") %>% 
  mutate(or = exp(estimate))


## rr given apoe4 or apoe3
tbl = apoe_haps %>% 
  group_by(apoe3_or_4_carrier) %>% 
  dplyr::count(dementia_status)%>% 
  mutate(prop = n/sum(n)) %>%
  filter(dementia_status=="Case")
rr = tbl[tbl$apoe3_or_4_carrier=="yes",]$prop / tbl[tbl$apoe3_or_4_carrier=="no",]$prop

res2 = paf_calc_dichotomous(0.983,1.07,prop_exp,prop_exp_cases) %>%
  mutate(exposure = "APOE3 or 4 (dominant)")


# stratified method
## just apoe4 
case_counts = apoe_haps %>% 
  group_by(apoe4_alleles) %>% 
  dplyr::count(dementia_status) %>% 
  filter(dementia_status == "Case") %>% 
  ungroup() %>%
  mutate(case_fraction = n/sum(n))%>%
  mutate(apoe4_alleles = as.character(apoe4_alleles))
geno_prev = apoe_haps %>% 
  dplyr::count(apoe4_alleles) %>% 
  mutate(prev = n/sum(n))%>%
  mutate(apoe4_alleles = as.character(apoe4_alleles))

pafs = glm(data = apoe_haps %>%
             mutate(apoe4_alleles = as.character(apoe4_alleles)) %>%
             mutate(dementia_status = ifelse(dementia_status == "Case",1,0)),
           dementia_status ~ S1QST_Gender + AgeAtRecruitment + pop.GSA.51k + 
             PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + 
             apoe4_alleles,
           family=binomial(link="logit")) %>% 
  broom::tidy() %>% 
  filter(grepl("apoe",term)) %>%
  mutate(apoe4_alleles = str_remove_all(term,"apoe4_alleles")) %>%
  mutate(or = exp(estimate)) %>%
  mutate(lower = exp(estimate - 1.96*std.error)) %>%
  mutate(upper = exp(estimate + 1.96*std.error)) %>%
  left_join(case_counts,by="apoe4_alleles") %>%
  left_join(geno_prev %>% mutate(total = n),by="apoe4_alleles") %>% 
  mutate(paf_est = (or-1)/or*case_fraction) %>%
  mutate(paf_est_lower = (lower-1)/lower*case_fraction) %>%
  mutate(paf_est_upper = (upper-1)/upper*case_fraction) %>% 
  dplyr::select(apoe4_alleles,contains("paf"))
pafs
pafs %>% dplyr::select(-1) %>% colSums()



## apoe3 or apoe4
case_counts = apoe_haps %>% 
  group_by(apoe4_doses) %>% 
  dplyr::count(dementia_status) %>% 
  filter(dementia_status == "Case") %>% 
  ungroup() %>%
  mutate(case_fraction = n/sum(n))
geno_prev = apoe_haps %>% 
  dplyr::count(apoe4_doses) %>% 
  mutate(prev = n/sum(n))

apoe_haps$apoe4_doses = relevel(apoe_haps$apoe4_doses,ref="E2/E2")
pafs = glm(data = apoe_haps %>%
      mutate(dementia_status = ifelse(dementia_status == "Case",1,0)),
    dementia_status ~ S1QST_Gender + AgeAtRecruitment + pop.GSA.51k + 
      PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + 
      apoe4_doses,
    family=binomial(link="logit")) %>% 
  broom::tidy() %>% 
  filter(grepl("apoe",term)) %>%
  mutate(apoe4_doses = str_remove_all(term,"apoe4_doses")) %>%
  mutate(or = exp(estimate)) %>%
  mutate(lower = exp(estimate - 1.96*std.error)) %>%
  mutate(upper = exp(estimate + 1.96*std.error)) %>%
  left_join(case_counts,by="apoe4_doses") %>%
  left_join(geno_prev %>% mutate(total = n),by="apoe4_doses") %>% 
  mutate(paf_est = (or-1)/or*case_fraction) %>%
  mutate(paf_est_lower = (lower-1)/lower*case_fraction) %>%
  mutate(paf_est_upper = (upper-1)/upper*case_fraction) %>% 
  dplyr::select(apoe4_doses,contains("paf"))
pafs

# get ratios 
model = glm(data = apoe_haps %>%
      mutate(dementia_status = ifelse(dementia_status == "Case",1,0)),
    dementia_status ~ S1QST_Gender + AgeAtRecruitment + pop.GSA.51k + 
      PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + 
      apoe4_doses,
    family=binomial(link="logit")) %>% 
  broom::tidy() %>% 
  filter(grepl("apoe",term)) %>% 
  mutate(apoe4_doses = str_remove_all(term,"apoe4_doses"))  %>%
  mutate(or = exp(estimate)) %>%
  mutate(lower = exp(estimate - 1.96*std.error)) %>%
  mutate(upper = exp(estimate + 1.96*std.error)) %>% 
  dplyr::select(apoe4_doses,or,lower,upper)
# ratios
ratio = model[model$apoe4_doses=="E4/E2",]$or/model[model$apoe4_doses=="E3/E2",]$or
upperratio = model[model$apoe4_doses=="E4/E2",]$upper/model[model$apoe4_doses=="E3/E2",]$lower
lowerratio = model[model$apoe4_doses=="E4/E2",]$lower/model[model$apoe4_doses=="E3/E2",]$upper

# e4 calc 
e4 = data.frame(
  apoe4_doses = "Combined E4",
  paf_est = pafs[pafs$apoe4_doses=="E4/E2",]$paf_est + pafs[pafs$apoe4_doses=="E4/E4",]$paf_est + (ratio / (ratio+1) ) * pafs[pafs$apoe4_doses=="E4/E3",]$paf_est,
  paf_est_lower = pafs[pafs$apoe4_doses=="E4/E2",]$paf_est_lower + pafs[pafs$apoe4_doses=="E4/E4",]$paf_est_lower + (lowerratio / (lowerratio+1) ) * pafs[pafs$apoe4_doses=="E4/E3",]$paf_est_lower,
  paf_est_upper = pafs[pafs$apoe4_doses=="E4/E2",]$paf_est_upper + pafs[pafs$apoe4_doses=="E4/E4",]$paf_est_upper + (upperratio / (upperratio+1) ) * pafs[pafs$apoe4_doses=="E4/E3",]$paf_est_upper 
)

# e3 calc
e3 = data.frame(
  apoe4_doses = "Combined E3",
  paf_est = pafs[pafs$apoe4_doses=="E3/E2",]$paf_est + pafs[pafs$apoe4_doses=="E3/E3",]$paf_est + ( 1 - (ratio / (ratio+1) ) )* pafs[pafs$apoe4_doses=="E4/E3",]$paf_est,
  paf_est_lower = pafs[pafs$apoe4_doses=="E3/E2",]$paf_est_lower + pafs[pafs$apoe4_doses=="E3/E3",]$paf_est_lower + (1 - (lowerratio / (lowerratio+1) ) ) * pafs[pafs$apoe4_doses=="E4/E3",]$paf_est_lower, 
  paf_est_upper = pafs[pafs$apoe4_doses=="E3/E2",]$paf_est_upper + pafs[pafs$apoe4_doses=="E3/E3",]$paf_est_upper + (1 - (upperratio / (upperratio+1) ) )* pafs[pafs$apoe4_doses=="E4/E3",]$paf_est_upper )

data.frame(apoe4_doses = "E3 & E4")
bind_rows(e4,e3,pafs) 

# crude 
pafs = pafs %>% dplyr::select(-1) %>% colSums() %>%
  t() %>%
  data.frame() %>%
  mutate(apoe4_doses = "Combined E3 & E4") %>%
  bind_rows(pafs,e3,e4) 
pafs$apoe4_doses = factor(pafs$apoe4_doses,
                          levels = c("Combined E3 & E4","Combined E3","Combined E4","E3/E2","E3/E3","E4/E2","E4/E3","E4/E4"
                                     ))
pafs = pafs %>% mutate(col = ifelse(grepl("Combined",apoe4_doses),"Combined","Other"))
ggplot(pafs,aes(apoe4_doses,100*paf_est,col=col,label=paste0(round(100*paf_est,1),"%")))+
  geom_point(size=3)+
  scale_color_brewer(palette="Dark2")+
  geom_errorbar(mapping = aes(x = apoe4_doses,ymin=100*paf_est,ymax=100*paf_est_upper),width=0.01)+
  theme_bw()+
  theme(legend.position = "none")+
  labs(y="PAF (%)",x="APOE genotype")+
  ggrepel::geom_text_repel(nudge_x = 0.1)

# repeat with E2/E2 and E2/E3 combined reference
apoe_haps = apoe_haps %>% mutate(apoe_allele_for_paf = ifelse(
  apoe4_doses %in% c("E2/E2","E3/E2"), "reference",as.character(apoe4_doses)
))

case_counts = apoe_haps %>% 
  group_by(apoe_allele_for_paf) %>% 
  dplyr::count(dementia_status) %>% 
  filter(dementia_status == "Case") %>% 
  ungroup() %>%
  mutate(case_fraction = n/sum(n))
geno_prev = apoe_haps %>% 
  dplyr::count(apoe_allele_for_paf) %>% 
  mutate(prev = n/sum(n))

apoe_haps$apoe_allele_for_paf = relevel(factor(apoe_haps$apoe_allele_for_paf),ref="reference")
pafs = glm(data = apoe_haps %>%
             mutate(dementia_status = ifelse(dementia_status == "Case",1,0)),
           dementia_status ~ S1QST_Gender + AgeAtRecruitment + pop.GSA.51k + 
             PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + 
             apoe_allele_for_paf,
           family=binomial(link="logit")) %>% 
  broom::tidy() %>% 
  filter(grepl("apoe",term) | term=="reference") %>%
  mutate(apoe_allele_for_paf = str_remove_all(term,"apoe_allele_for_paf")) %>%
  mutate(or = exp(estimate)) %>%
  mutate(lower = exp(estimate - 1.96*std.error)) %>%
  mutate(upper = exp(estimate + 1.96*std.error)) %>%
  left_join(case_counts,by="apoe_allele_for_paf") %>%
  left_join(geno_prev %>% mutate(total = n),by="apoe_allele_for_paf") %>% 
  mutate(paf_est = (or-1)/or*case_fraction) %>%
  mutate(paf_est_lower = (lower-1)/lower*case_fraction) %>%
  mutate(paf_est_upper = (upper-1)/upper*case_fraction) %>% 
  dplyr::select(apoe_allele_for_paf,contains("paf"))
pafs = pafs %>% bind_rows(
pafs %>% dplyr::select(-1) %>% colSums())
pafs$apoe_allele_for_paf[5]="Combined"



ggplot(pafs,aes(apoe_allele_for_paf,
                100*paf_est,
                col=apoe_allele_for_paf!="Combined",
       label = paste0(round(paf_est,3)*100,"%")))+
  geom_point(show.legend = F)+
  geom_errorbar(mapping = aes(x = apoe_allele_for_paf,ymin=100*paf_est_lower,ymax=100*paf_est_upper),width=0.01,show.legend = F)+
  theme_bw()+
  labs(y="PAF (%)",x="APOE genotype")+
  scale_color_brewer(palette="Set1")+
  theme(panel.grid = element_blank())+
  ggrepel::geom_text_repel(show.legend = F)
pafs



# repeat for just E4
# survival curves
km_fit = survfit(
  Surv(survtime,status) ~ apoe4_alleles, 
  data = apoe_haps
)
p0 = ggsurvplot(km_fit,
           fun = "pct",
           palette = "Set1",conf.int = F,pval = T,
           ylab = "Probability of dementia-free survival",
           xlab="Age (years)",
           legend.labs = c("APOE E4 -/-","APOE E4 +/-","APOE E4 +/+"),
           legend=c(0.3,0.7)
           )

p0 = p0$plot+
  scale_x_continuous(labels = function(x) x + 16,breaks = c(12,22,32,42,52,62,72))+
  geom_hline(yintercept=50,linetype="dashed",alpha=0.3)
p0


# repeat for other alleles 
# survival curves
km_fit = survfit(
  Surv(survtime,status) ~ apoe4_doses, 
  data = apoe_haps
)
p99 = ggsurvplot(km_fit,
                fun = "pct",
                palette = "Set1",conf.int = F,pval = T,
                ylab = "Probability of dementia-free survival",
                xlab="Age (years)",
                legend=c(0.3,0.7)
)

p99 = p99$plot+
  scale_x_continuous(labels = function(x) x + 16,breaks = c(12,22,32,42,52,62,72))+
  geom_hline(yintercept=50,linetype="dashed",alpha=0.3)



coxph(
  Surv(survtime,status) ~ apoe4_alleles,
  data = apoe_haps
) %>% summary()


# model 
# set ref 
apoe_haps$apoe4_doses = relevel(factor(apoe_haps$apoe4_doses),ref="E3/E3")
model = coxph(
  Surv(survtime,status) ~ S1QST_Gender + AgeAtRecruitment + pop.GSA.51k + 
    PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + 
    apoe4_doses,
  data = apoe_haps %>% filter(pop.GSA.51k != "Ambiguous")
) %>% broom::tidy()

model$std.error
plot_dat = model %>% filter(grepl("apoe",term)) %>% 
  mutate(term = str_remove_all(term,"apoe4_doses"))
plot_dat$p_simple = sapply(plot_dat$p.value,function(x){
if(x<0.0001){
  "<0.0001"
} else if(x<0.001){
  "<0.001"
} else if(x<0.01){
  "<0.01"
} else {
  ">0.01"
}
})

plot_dat$label = paste0("HR = ",
                        round(exp(plot_dat$estimate),1),"; P",plot_dat$p_simple)
plot_dat$lower = exp(plot_dat$estimate-1.96*plot_dat$std.error)
plot_dat$upper = exp(plot_dat$estimate+1.96*plot_dat$std.error)
plot_dat
p1 = ggplot(plot_dat,
       aes(exp(estimate),term,label=label))+
  geom_point()+
  geom_errorbarh(mapping = aes(
                   xmin = exp(estimate - 1.96*std.error),
                   xmax = exp(estimate + 1.96*std.error)),
                 height=0.01,alpha=0.4
                 )+
  theme_bw()+
  labs(x="Hazard Ratio for dementia\nrelative to APOE 3/3 genotype",y="APOE genotype")+
  geom_vline(xintercept=1,linetype="dashed",alpha=0.3)+
  ggrepel::geom_text_repel(nudge_y=0.1)+
  scale_x_log10()
  

png("/home/ivm/apoe/export/survival_plots.png",res=900,units="in",width=8,height=5)
cowplot::plot_grid(p0,p1)
dev.off()

Sensitivity analysis

# repeat with controls >60 
controls_over_60_dat = apoe_haps %>% filter(AgeAtRecruitment>60 & (is.na(age_at_event) | age_at_event>60 ) ) 
controls_over_60_dat%>%
  dplyr::count(dementia_status)
controls_over_60_dat%>%
  group_by(dementia_status) %>%
  summarise_at(.vars = vars(AgeAtRecruitment,age_at_event),.funs = c(median,IQR),na.rm=T)

controls_over_60_dat%>%
  group_by(dementia_status) %>%
  dplyr::count(S1QST_Gender) %>% 
  mutate(prop = n/sum(n))

controls_over_60_dat%>%
  group_by(dementia_status) %>%
  dplyr::count(apoe4_doses) %>% 
  mutate(prop = n/sum(n)) %>% 
  pivot_wider(values_from = prop,names_from = apoe4_doses,id_cols = dementia_status)

controls_over_60_dat = controls_over_60_dat %>% 
  mutate(age_at_data_extract = as.numeric(as.Date("2024-01-01",format="%Y-%m-%d") - s1qst_dob)/365.25) %>%
  mutate(status = ifelse(dementia_status == "Control",0,1)) %>% 
  mutate(survtime = ifelse(dementia_status == "Control",age_at_data_extract-60,age_at_event-60))  

model = coxph(
  Surv(survtime,status) ~ S1QST_Gender + AgeAtRecruitment + pop.GSA.51k + 
    PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + 
    apoe4_doses,
  data = controls_over_60_dat %>% filter(pop.GSA.51k != "Ambiguous")
) %>% broom::tidy()

plot_dat = model %>% filter(grepl("apoe",term)) %>% 
  mutate(term = str_remove_all(term,"apoe4_doses"))
plot_dat$p_simple = sapply(plot_dat$p.value,function(x){
  if(x<0.0001){
    "<0.0001"
  } else if(x<0.001){
    "<0.001"
  } else if(x<0.01){
    "<0.01"
  } else {
    ">0.01"
  }
})

plot_dat$label = paste0("HR = ",
                        round(exp(plot_dat$estimate),1),"; P",plot_dat$p_simple)
plot_dat$lower = exp(plot_dat$estimate-1.96*plot_dat$std.error)
plot_dat$upper = exp(plot_dat$estimate+1.96*plot_dat$std.error)
plot_dat %>% dplyr::select(-2,-3,-4)



# just e4
coxph(
  Surv(survtime,status) ~  S1QST_Gender+AgeAtRecruitment + pop.GSA.51k + 
    PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + 
    apoe4_alleles,
  data = apoe_haps %>% filter(pop.GSA.51k != "Ambiguous")
) %>% broom::tidy() %>% 
  filter(grepl("apoe4_alleles",term)) %>% 
  mutate(HR = exp(estimate)) %>%
  mutate(lowerci = exp(estimate - 1.96* std.error)) %>%
  mutate(upperci = exp(estimate + 1.96* std.error))
  
# just e4
coxph(
  Surv(survtime,status) ~  S1QST_Gender+AgeAtRecruitment + pop.GSA.51k + 
    PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + 
    apoe4_status,
  data = apoe_haps %>% mutate(apoe4_status = ifelse(apoe4_alleles >0,"yes","no")) %>%
                              filter(pop.GSA.51k != "Ambiguous")) %>% broom::tidy() %>% 
  filter(grepl("apoe4_status",term)) %>% 
  mutate(HR = exp(estimate)) %>%
  mutate(lowerci = exp(estimate - 1.96* std.error)) %>%
  mutate(upperci = exp(estimate + 1.96* std.error))



# gender-stratified 
apoe_haps$gender = ifelse(apoe_haps$S1QST_Gender==1,"M","F")
km_fit = survfit(
  Surv(survtime,status) ~ apoe4_alleles, 
  data = apoe_haps
)
p0 = ggsurvplot_facet(km_fit,data=apoe_haps,
                 facet.by = "gender",
                 legend.labs = c("APOE E4 -/-","APOE E4 +/-","APOE E4 +/+"),
                 fun = "pct",
                 ylab = "Probability of dementia-free survival",
                 xlab="Age (years)",
                 palette = "Set1",
                 legend=c(0.1,0.7),
                 conf.int = F
                 
) +
  scale_x_continuous(labels = function(x) x + 16,breaks = c(12,22,32,42,52,62,72))+
  geom_hline(yintercept=50,linetype="dashed",alpha=0.3)

p0
coxph(
  Surv(survtime,status) ~  +AgeAtRecruitment + pop.GSA.51k + 
    PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + 
    apoe4_doses,
  data = apoe_haps %>% filter(pop.GSA.51k != "Ambiguous" & S1QST_Gender == "1")
) %>% broom::tidy() %>% 
  filter(grepl("apoe4_doses",term)) %>% 
  mutate(HR = exp(estimate)) %>% 
  mutate(lowerci = exp(estimate - 1.96* std.error)) %>%
  mutate(upperci = exp(estimate + 1.96* std.error))

# repeat with AD dementia

apoe_haps$apoe4_doses = relevel(factor(apoe_haps$apoe4_doses),ref="E3/E3")

coxph(
  Surv(survtime,status) ~  S1QST_Gender+AgeAtRecruitment + pop.GSA.51k + 
    PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + 
    apoe4_doses,
  data = apoe_haps %>% filter(pop.GSA.51k != "Ambiguous")
) %>% broom::tidy() %>% 
  filter(grepl("apoe4_doses",term)) %>% 
  mutate(HR = exp(estimate)) %>% 
  mutate(lowerci = exp(estimate - 1.96* std.error)) %>%
  mutate(upperci = exp(estimate + 1.96* std.error))

coxph(
  Surv(survtime,status) ~  +AgeAtRecruitment + pop.GSA.51k + 
    PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + 
    apoe4_doses,
  data = apoe_haps %>% filter(pop.GSA.51k != "Ambiguous" & S1QST_Gender == "2")
) %>% broom::tidy() %>% 
  filter(grepl("apoe4_doses",term)) %>% 
  mutate(HR = exp(estimate)) %>% mutate(lowerci = exp(estimate - 1.96* std.error)) %>%
  mutate(upperci = exp(estimate + 1.96* std.error))


# interaction 
coxph(
  Surv(survtime,status) ~  +AgeAtRecruitment + pop.GSA.51k + 
    PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + 
    apoe4_alleles * S1QST_Gender,
  data = apoe_haps %>% filter(pop.GSA.51k != "Ambiguous")
) %>% broom::tidy() %>% 
  filter(grepl("apoe4_alleles",term)) %>% 
  mutate(HR = exp(estimate)) %>% 
  mutate(lowerci = exp(estimate - 1.96* std.error)) %>%
  mutate(upperci = exp(estimate + 1.96* std.error))

ggplot(apoe_haps,aes(AgeAtRecruitment,age_at_event))+geom_point()+
  geom_abline(intercept=0,slope=1)

hist(apoe_haps$AgeAtRecruitment)

# repeat for ad 
apoe_haps = apoe_haps %>% 
  mutate(age_at_data_extract = as.numeric(as.Date("2024-01-01",format="%Y-%m-%d") - s1qst_dob)/365.25) %>%
  mutate(status = ifelse(ad_status == "Control",0,1)) %>% 
  mutate(survtime = ifelse(ad_status == "Control",age_at_data_extract-16,age_at_event_ad-16))  
model_ad = coxph(
  Surv(survtime,status) ~ S1QST_Gender + AgeAtRecruitment + pop.GSA.51k + 
    PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + 
    apoe4_doses,
  data = apoe_haps %>% filter(pop.GSA.51k != "Ambiguous")
) %>% broom::tidy() %>% 
  filter(grepl("apoe4",term)) %>%
  mutate(outcome = "AD")
model_ad %>%  mutate(HR = exp(estimate)) %>% 
  mutate(lowerci = exp(estimate - 1.96* std.error)) %>%
  mutate(upperci = exp(estimate + 1.96* std.error)) 
apoe_haps$ad_status %>% table 

km_fit = survfit(
  Surv(survtime,status) ~ apoe4_alleles, 
  data = apoe_haps
)
p0 = ggsurvplot(km_fit,
                fun = "pct",
                palette = "Set1",conf.int = F,pval = T,
                ylab = "Probability of dementia-free survival",
                xlab="Age (years)",
                legend.labs = c("APOE E4 -/-","APOE E4 +/-","APOE E4 +/+"),
                legend=c(0.3,0.7)
)

p0 = p0$plot+
  scale_x_continuous(labels = function(x) x + 16,breaks = c(12,22,32,42,52,62,72))+
  geom_hline(yintercept=50,linetype="dashed",alpha=0.3)

# repeat for VaD
apoe_haps = apoe_haps %>% 
  mutate(age_at_data_extract = as.numeric(as.Date("2024-01-01",format="%Y-%m-%d") - s1qst_dob)/365.25) %>%
  mutate(status = ifelse(vad_status == "Control",0,1)) %>% 
  mutate(survtime = ifelse(vad_status == "Control",age_at_data_extract-16,age_at_event_vad-16))  
model_vad = coxph(
  Surv(survtime,status) ~ S1QST_Gender + AgeAtRecruitment + pop.GSA.51k + 
    PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + 
    apoe4_doses,
  data = apoe_haps %>% filter(pop.GSA.51k != "Ambiguous")
) %>% broom::tidy() %>% 
  filter(grepl("apoe4",term)) %>%
  mutate(outcome = "VaD")

PheWAS

# custom binary trait phewas
res = list()
pheno_list = list.files("/genesandhealth/library-red/genesandhealth/phenotypes_curated/version010_2025_04/BI_PY/outputs/custom_phenotypes/individual_trait_files/",pattern = "csv")
for(i in c(1:length(pheno_list))){
  phenotype = pheno_list[i]
  
  # read in 
  pheno_dat = read_csv(paste0("/genesandhealth/library-red/genesandhealth/phenotypes_curated/version010_2025_04/BI_PY/outputs/custom_phenotypes/individual_trait_files/",phenotype))
  
  if(nrow(pheno_dat)>500){
  # get short name
  pheno_name = pheno_dat$phenotype[1]
  
  # take earliest 
  pheno_dat = pheno_dat %>% 
    group_by(nhs_number) %>% slice_min(date,n=1,with_ties = F) %>% ungroup()
  
  # join with oragene 
  pheno_dat = pheno_dat %>%
    inner_join(link_file,by="nhs_number")
  
  # join apoe data with phenotype
  pheno_dat = apoe_haps %>%
    mutate(IID = as.numeric(IID)) %>%
    dplyr::select(-phenotype,-age_at_event) %>%
    left_join(pheno_dat,by="IID") %>%
    mutate(pheno_status = ifelse(is.na(age_at_event),"Control","Case"))
  
    # exclude ambiguous ancestry
    pheno_dat = pheno_dat %>% filter(pop.GSA.51k != "Ambiguous")
    
    # set ref 
    pheno_dat$apoe4_doses = relevel(factor(pheno_dat$apoe4_doses),ref="E3/E3")
    model = glm(data = pheno_dat %>%
                  mutate(pheno_status = ifelse(pheno_status == "Case",1,0)),
                pheno_status ~ S1QST_Gender + AgeAtRecruitment + pop.GSA.51k + 
                  PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + 
                  apoe4_doses,
                family=binomial(link="logit")) %>% 
      broom::tidy() %>% 
      filter(grepl("apoe",term)) %>% 
      mutate(phenotype = pheno_name)
    
  res[[i]] = model  
  }
}

# combine res 
res = do.call("bind_rows",res)


# repeat for quant traits 
res_quant = list()
pheno_list_quant = list.files("/genesandhealth/library-red/genesandhealth/phenotypes_curated/version010_2025_04/QUANT_PY/outputs/individual_trait_files/all/",pattern = "per_individual")
for(i in c(1:length(pheno_list_quant))){
  phenotype = pheno_list_quant[i]
  
  # read in 
  pheno_dat = read_csv(paste0("/genesandhealth/library-red/genesandhealth/phenotypes_curated/version010_2025_04/QUANT_PY/outputs/individual_trait_files/all/",
                                                            phenotype)
                              )
  
  if(nrow(pheno_dat)>5000){
  # get short name
  pheno_name = pheno_dat$trait[1]
  

  # join with oragene 
  pheno_dat = pheno_dat %>%
    dplyr::select(pseudo_nhs_number,median) %>%
    dplyr::rename("nhs_number" = pseudo_nhs_number) %>%
    inner_join(link_file,by="nhs_number")
  
  # join apoe data with phenotype
  pheno_dat = apoe_haps %>%
    mutate(IID = as.numeric(IID)) %>%
    left_join(pheno_dat,by="IID")
  
  # exclude ambiguous ancestry
  pheno_dat = pheno_dat %>% filter(pop.GSA.51k != "Ambiguous")
  
  # set ref 
  pheno_dat$apoe4_doses = relevel(factor(pheno_dat$apoe4_doses),ref="E3/E3")
  
  # rank norm 
  model_dat = pheno_dat %>% filter(!is.na(median))
  model_dat$pheno_norm = RNOmni::RankNorm(model_dat$median)
  model = lm(data = model_dat,
             pheno_norm ~ S1QST_Gender + AgeAtRecruitment + pop.GSA.51k + 
                PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + 
                apoe4_doses) %>% 
    broom::tidy() %>% 
    filter(grepl("apoe",term)) %>% 
    mutate(phenotype = pheno_name)
  
  res_quant[[i]] = model  
  }
}

# combine binary & quant
# arrange by p 
res_quant =do.call("bind_rows",res_quant)

# combine 
res = bind_rows(res,res_quant)

res = res %>% arrange(p.value)

# bonf 
res$phenotype %>% unique %>% length

# clean 
res$term = str_remove_all(res$term,"apoe4_doses")

# plot 
res$fdr = p.adjust(res$p.value,method="fdr")
res$bonf = p.adjust(res$p.value,method="bonf")
res$direction = ifelse(res$estimate<0,"decreases","increases")

# plot 
res$phenotype = str_replace_all(res$phenotype,"_"," ")
res$phenotype = factor(res$phenotype,ordered=T,levels = unique(res$phenotype))

png("/home/ivm/apoe/export/phewas.png",res=900,units="in",width=8,height=5)
ggplot(res %>% filter(fdr<0.05),aes(phenotype,-log10(p.value),col=term,label=phenotype,shape = direction))+
  geom_point(size=3)+
  theme_bw()+
  scale_color_brewer(palette="Set1")+
  labs(x="Phenotype",y=bquote(-log[10]~P),shape="Effect direction",color="Genotype")+
  coord_flip()+
  geom_hline(yintercept=-log10(0.05/236),alpha=0.1,linetype="dashed")
dev.off()
res$phenotype %>% unique()


ggplot(plot_dat,
       aes(term,phenotype,fill=direction,alpha = -log10(p.value)/max(-log10(sighits$p.value)),label = bonf_simple))+
  geom_tile(color="black")+
  theme_bw()+
  scale_fill_manual(values = c("blue","red"))+
  geom_text(alpha=1,size=5)+
  guides(alpha="none")+
  labs(fill="Direction of effect\n(relative to E3/E3)",x="APOE genotype",y="Phenotype")+
  theme(panel.grid = element_blank())
  
res %>% filter(fdr<0.05)
ggplot(res,aes(phenotype,-log10(p.value),fill=term,label = paste0(term," ",direction," ",phenotype)))+
  geom_point(data=  res %>% filter(fdr < 0.05),shape=21)+
  theme_bw()+
  theme(axis.text.x = element_blank(),axis.line.x = element_blank(),axis.ticks.x = element_blank())+
  ggrepel::geom_text_repel(data = res %>% filter(bonf<0.05),min.segment.length = 0,size=3,nudge_x = 20)