## ----setup--------------------------------------------------------------------
library(ApplyPolygenicScore)

## ----import-vcf---------------------------------------------------------------
vcf.file <- system.file(
    'extdata',
    'HG001_GIAB.vcf.gz',
    package = 'ApplyPolygenicScore',
    mustWork = TRUE
    )

vcf.data <- import.vcf(
    vcf.path = vcf.file,
    info.fields = NULL,
    format.fields = NULL,
    verbose = TRUE
    )

str(vcf.data)

## ----import-pgs---------------------------------------------------------------
pgs.file <- system.file(
    'extdata',
    'PGS003378_hmPOS_GRCh38.txt.gz',
    package = 'ApplyPolygenicScore',
    mustWork = TRUE
    )

pgs.data <- import.pgs.weight.file(
    pgs.weight.path = pgs.file,
    use.harmonized.data = TRUE
    )

str(pgs.data)

## ----import-phenotype---------------------------------------------------------

# Isolating the individual IDs from the VCF data
vcf.individuals <- unique(vcf.data$dat$Indiv)

# Simulating phenotype data
set.seed(123)
phenotype.data <- data.frame(
    Indiv = vcf.individuals,
    continuous.phenotype = rnorm(length(vcf.individuals)),
    binary.phenotype = rbinom(length(vcf.individuals), 1, 0.5)
    )

head(phenotype.data)

## ----convert-pgs-to-bed-------------------------------------------------------

pgs.coordinate.info <- pgs.data$pgs.weight.data

pgs.bed.format <- convert.pgs.to.bed(
    pgs.weight.data = pgs.coordinate.info,
    chr.prefix = TRUE,
    numeric.sex.chr = FALSE,
    slop = 10
    )

head(pgs.bed.format)

## ----merge-pgs-bed------------------------------------------------------------
# convert your PGS weight files with no added slop
pgs.bed1 <- convert.pgs.to.bed(pgs.coordinate.info, slop = 0)

# simulating a second PGS with all coordinates shifted by 20 base pairs.
pgs.bed2 <- pgs.bed1
pgs.bed2$start <- pgs.bed2$start + 20
pgs.bed2$end <- pgs.bed2$end + 20

# Input must be a named list
pgs.bed.list <- list(PGS1 = pgs.bed1, PGS2 = pgs.bed2)

merged.pgs.bed <- combine.pgs.bed(
    pgs.bed.list = pgs.bed.list,
    add.annotation.data = TRUE,
    annotation.column.index = which(colnames(pgs.bed1) == 'rsID') # keep information from the rsID column during merge
    )

str(merged.pgs.bed)

## ----check-pgs-columns--------------------------------------------------------

apply.polygenic.score(
    vcf.data = vcf.data$dat,
    pgs.weight.data = pgs.data$pgs.weight.data,
    phenotype.data = phenotype.data,
    validate.inputs.only = TRUE
    )


## ----apply-pgs----------------------------------------------------------------

pgs.results <- apply.polygenic.score(
    vcf.data = vcf.data$dat,
    pgs.weight.data = pgs.data$pgs.weight.data,
    correct.strand.flips = FALSE, # no strand flip check to avoid warnings
    missing.genotype.method = 'none'
    )

str(pgs.results)

## ----merge-vcf-with-pgs-------------------------------------------------------

merged.vcf.pgs.data <- combine.vcf.with.pgs(
    vcf.data = vcf.data$dat,
    pgs.weight.data = pgs.data$pgs.weight.data
    )

names(merged.vcf.pgs.data)
head(merged.vcf.pgs.data$missing.snp.data)[, 1:6]

## ----allele-matching, eval = FALSE--------------------------------------------
#  
#  # Most strict allele match handling
#  strict.allele.match.result <- apply.polygenic.score(
#      vcf.data = vcf.data$dat,
#      pgs.weight.data = pgs.data$pgs.weight.data,
#      missing.genotype.method = 'none',
#      correct.strand.flips = TRUE,
#      remove.ambiguous.allele.matches = TRUE,
#      remove.mismatched.indels = TRUE
#      );
#  
#  # Less strict allele match handling
#  less.strict.allele.match.result <- apply.polygenic.score(
#      vcf.data = vcf.data$dat,
#      pgs.weight.data = pgs.data$pgs.weight.data,
#      missing.genotype.method = 'none',
#      correct.strand.flips = TRUE,
#      remove.ambiguous.allele.matches = FALSE,
#      remove.mismatched.indels = FALSE
#      );
#  

## ----missing-genotype-methods-------------------------------------------------

all.missing.methods.pgs.results <- apply.polygenic.score(
    vcf.data = vcf.data$dat,
    pgs.weight.data = pgs.data$pgs.weight.data,
    correct.strand.flips = FALSE, # no strand flip check to avoid warnings
     missing.genotype.method = c('none', 'normalize', 'mean.dosage')
    )

head(all.missing.methods.pgs.results$pgs.output)

## ----custom-percentiles-------------------------------------------------------

custom.percentiles.pgs.results <- apply.polygenic.score(
    vcf.data = vcf.data$dat,
    pgs.weight.data = pgs.data$pgs.weight.data,
    correct.strand.flips = FALSE, # no strand flip check to avoid warnings
    n.percentiles = 5
    )

head(custom.percentiles.pgs.results$pgs.output)


## ----load-large-vcf-----------------------------------------------------------

phenotype.test.data.path <- system.file(
    'extdata',
    'phenotype.test.data.Rda',
    package = 'ApplyPolygenicScore',
    mustWork = TRUE
    )

load(phenotype.test.data.path)
str(phenotype.test.data)


## ----phenotype-merge----------------------------------------------------------

pgs.results.with.phenotype <- apply.polygenic.score(
    vcf.data = phenotype.test.data$vcf.data,
    pgs.weight.data = phenotype.test.data$pgs.weight.data,
    phenotype.data = phenotype.test.data$phenotype.data
    )

head(pgs.results.with.phenotype$pgs.output)

## ----phenotype-analysis-------------------------------------------------------

pgs.results.with.phenotype.analysis <- apply.polygenic.score(
    vcf.data = phenotype.test.data$vcf.data,
    pgs.weight.data = phenotype.test.data$pgs.weight.data,
    phenotype.data = phenotype.test.data$phenotype.data,
    phenotype.analysis.columns = c('continuous.phenotype', 'binary.phenotype')
    )

head(pgs.results.with.phenotype.analysis$regression.output)


## ----plotting-dir, echo = FALSE-----------------------------------------------
temp.dir <- tempdir();

basic.density.filename <- ApplyPolygenicScore:::generate.filename(
    project.stem = 'vignette-example-basic',
    file.core = 'pgs-density',
    extension = 'png'
    );

phenotype.density.filename <- ApplyPolygenicScore:::generate.filename(
    project.stem = 'vignette-example-phenotype',
    file.core = 'pgs-density',
    extension = 'png'
    );

correlation.filename <- ApplyPolygenicScore:::generate.filename(
    project.stem = 'vignette-example-correlation',
    file.core = 'pgs-scatter',
    extension = 'png'
    );

basic.rank.filename <- ApplyPolygenicScore:::generate.filename(
    project.stem = 'vignette-example-basic',
    file.core = 'pgs-rank-plot',
    extension = 'png'
    );

phenotype.rank.filename <- ApplyPolygenicScore:::generate.filename(
    project.stem = 'vignette-example-phenotype',
    file.core = 'pgs-rank-plot',
    extension = 'png'
    );


## ----pgs-density, eval = TRUE-------------------------------------------------

create.pgs.density.plot(
    pgs.data = pgs.results.with.phenotype.analysis$pgs.output,
    output.dir = temp.dir,
    filename.prefix = 'vignette-example-basic',
    file.extension = 'png'
    )


## ----out.width = '50%', echo = FALSE------------------------------------------
knitr::include_graphics(file.path(temp.dir, basic.density.filename));

## ----pgs-density-phenotype, eval = TRUE---------------------------------------

create.pgs.density.plot(
    pgs.data = pgs.results.with.phenotype.analysis$pgs.output,
    output.dir = temp.dir,
    filename.prefix = 'vignette-example-phenotype',
    file.extension = 'png',
    tidy.titles = TRUE,
    phenotype.columns = c('binary.factor.phenotype', 'categorical.phenotype', 'continuous.phenotype')
    )


## ----out.width = '50%', echo = FALSE------------------------------------------
knitr::include_graphics(file.path(temp.dir, phenotype.density.filename));

## ----pgs-correlation, eval = TRUE---------------------------------------------

create.pgs.with.continuous.phenotype.plot(
    pgs.data = pgs.results.with.phenotype.analysis$pgs.output,
    output.dir = temp.dir,
    filename.prefix = 'vignette-example-correlation',
    file.extension = 'png',
    tidy.titles = TRUE,
    phenotype.columns = c('continuous.phenotype', 'categorical.phenotype')
    )


## ----out.width = '70%', echo = FALSE------------------------------------------
knitr::include_graphics(file.path(temp.dir, correlation.filename));

## ----pgs-rank, eval = TRUE----------------------------------------------------

# Introducing some missing genotypes to demonstrate the missing genotype barplot
pgs.results.with.phenotype.analysis$pgs.output$n.missing.genotypes <- rep(c(0, 1, 0, 2, 1), 2)

create.pgs.rank.plot(
    pgs.data = pgs.results.with.phenotype.analysis$pgs.output,
    output.dir = temp.dir,
    filename.prefix = 'vignette-example-basic',
    file.extension = 'png'
    )


## ----out.width = '50%', echo = FALSE------------------------------------------
knitr::include_graphics(file.path(temp.dir, basic.rank.filename));

## ----pgs-rank-phenotype, eval = TRUE------------------------------------------

create.pgs.rank.plot(
    pgs.data = pgs.results.with.phenotype.analysis$pgs.output,
    output.dir = temp.dir,
    filename.prefix = 'vignette-example-phenotype',
    file.extension = 'png',
    phenotype.columns = c('binary.factor.phenotype', 'binary.phenotype', 'categorical.phenotype', 'continuous.phenotype')
    )


## ----out.width = '50%', echo = FALSE------------------------------------------
knitr::include_graphics(file.path(temp.dir, phenotype.rank.filename));