Methodology

An in-depth overview of each step in the workflow.

Table of contents

Rule Map/Diagram
Workflow Rules Explained

Rule Map/Diagram

---
title: Pharmacogenetics Analysis
config:
    layout: elk
---
flowchart TD
    START(((Input Files)))
    click START href "/workflow/data" _blank

    ValidateVcfWorkflow[\Validate VCF Workflow/]
    click ValidateVcfWorkflow href "https://tuks-icmm.github.io/VCF-Validation-Workflow/workflow/methodology" _blank

    START --> ValidateVcfWorkflow --> multipleVcfProtocol
    START --> format_sample_metadata

    subgraph pharmacogeneticsWorkflow [Pharmacogenetics Workflow]
        direction BT

        classDef bcftools stroke:#FF5733,fill:#D3D3D3,stroke-width:4px,color:black;
        classDef plink stroke:#36454F,fill:#D3D3D3,stroke-width:4px,color:black;
        classDef python stroke:#FEBE10,fill:#D3D3D3,stroke-width:4px,color:black;
        classDef admixture stroke:#333,fill:#D3D3D3,stroke-width:4px,color:black;
        classDef tabix stroke:#023020,fill:#D3D3D3,stroke-width:4px,color:black;
        classDef gatk stroke:#007FFF,fill:#D3D3D3,stroke-width:4px,color:black;
        classDef workflow stroke:#fff,fill:#000000,stroke-width:4px,color:white;

        subgraph multipleVcfProtocol [Multiple dataset protocol]
            direction LR
            multipleVcfProtocolStart(((Start)))
            ifMergeRequired{Is a <br>merge needed?}
            merge_datasets[[<b>merge_datasets</b>: <br>Merge multiple incoming <br>datasets]]

            normalize_merged_datasets[[<b>normalize_merged_datasets</b>: <br>normalize any multi-allelic <br>records created by merge]]
            multipleVcfProtocolEnd(((End)))

            class merge_datasets,normalize_merged_datasets bcftools;
            multipleVcfProtocolStart --> ifMergeRequired
            ifMergeRequired --> |yes| merge_datasets --> normalize_merged_datasets --> multipleVcfProtocolEnd
            ifMergeRequired --> |No| multipleVcfProtocolEnd
        end

        format_sample_metadata[[<b>format_sample_metadata</b>: <br>Transpile cluster ownership <br>from sample cluster assignment <br>into input format]]

        convert_to_pgen[[<b>convert_to_pgen</b>: <br>Convert the VCF fields <br>into Plink-2 binary <br>format]]

        verify_records_against_reference_genome[[<b>verify_records_against_reference_genome</b>: <br>Check reference alleles against <br>provided reference genome]]

        remove_non_standard_chromosomes[[<b>remove_non_standard_chromosomes</b>: <br>Filter out non-standard <br>chromosomes]]
        
        remove_unknown_samples[[<b>remove_unknown_samples</b>: <br>Subset samples to labeled <br>samples in metadata files]]
        
        filter_variant_missingness[[<b>filter_variant_missingness</b>: <br>Filter variants with 100% <br>missingness]]
        
        filter_sample_missingness[[<b>filter_sample_missingness</b>: <br>Filter samples with 100% <br>missingness]]

        calculate_sample_relatedness[[<b>calculate_sample_relatedness</b>: <br>Calculate relatedness]]

        remove_related_samples[[<b>remove_related_samples</b>: <br>remove a given list of <br>samples which are too <br>closely related]]

        extract_provided_coordinates[[<b>extract_provided_coordinates</b>: <br>Trim the dataset to one <br>of the studied regions]]

        subgraph variant_count[Variant Count]
            variant_count_START(((Start)))
            variant_count_END(((End)))
            report_count_partitioned_per_cluster[[<b>report_count_partitioned_per_cluster</b>: <br>Perform frequency analysis <br>across each cluster]]
            collect_variant_frequency[[<b>collect_variant_frequency</b>: <br>Collect cluster-level <br>variant frequency reports <br>into one]]
            collect_variant_count[[<b>collect_variant_count</b>: <br>Collect all per-cluster <br>variant count reports <br>into one]]

            variant_count_START --> report_count_partitioned_per_cluster & collect_variant_frequency & collect_variant_count
            report_count_partitioned_per_cluster --> collect_variant_frequency
            report_count_partitioned_per_cluster --> collect_variant_count

            collect_variant_count & collect_variant_frequency --> variant_count_END
        end

        subgraph hardy_weinberg[Hardy-Weinberg]
            hardy_weinberg_START(((Start)))
            hardy_weinberg_END(((End)))
            report_hardy_weinberg_per_cluster[[<b>report_hardy_weinberg_per_cluster</b>: <br>Perform HWE analysis <br>across each cluster]]
            collect_autosomal_hardy_weinberg[[<b>collect_autosomal_hardy_weinberg</b>: <br>Collect the HWE reports <br>for autosomal locations]]

            hardy_weinberg_START --> report_hardy_weinberg_per_cluster & collect_autosomal_hardy_weinberg
            report_hardy_weinberg_per_cluster --> collect_autosomal_hardy_weinberg
            collect_autosomal_hardy_weinberg --> hardy_weinberg_END
        end

        subgraph variant_missingness[Variant Missingness]
            variant_missingness_START(((Start)))
            variant_missingness_END(((End)))
            report_missingness_per_cluster[[<b>report_missingness_per_cluster</b>: <br>Report the missingness rates <br>observed across each cluster]]

            collect_variant_missingness[[<b>collect_variant_missingness</b>: <br>Collect all per-cluster <br>variant missingness reports <br>into one]]
            variant_missingness_START --> report_missingness_per_cluster & collect_variant_missingness
            report_missingness_per_cluster --> collect_variant_missingness
            collect_variant_missingness --> variant_missingness_END
        end

        subgraph vep[Variant Effect Prediction]
            vep_START(((Start)))
            vep_END(((End)))
            query_variant_effect_predictions[[<b>query_variant_effect_predictions</b>: <br>Perform API calls to <br>E! Ensemble REST API to <br>identify variants]]
            compile_variant_effect_predictions[[<b>compile_variant_effect_predictions</b>: <br>Collect the RAW API <br>payloads and extract <br>relevant metrics]]
            
            vep_START --> query_variant_effect_predictions & compile_variant_effect_predictions
            query_variant_effect_predictions --> compile_variant_effect_predictions
            compile_variant_effect_predictions --> vep_END
        end

        subgraph fishers_exact[Fishers Exact w/t Bonferoni Correction]
            fishers_exact_START(((Start)))
            fishers_exact_END(((End)))
            report_fishers_exact_with_corrections[[<b>report_fishers_exact_with_corrections</b>: <br>Perform Fishers-Exact test <br>with Bonferonni correction]]

            fishers_exact_START --> report_fishers_exact_with_corrections --> fishers_exact_END
        end






        consolidate_reports[[<b>consolidate_reports</b>: <br>Consolidate all the <br>generated reports <br>into one]]

        class format_sample_metadata,query_variant_effect_predictions,compile_variant_effect_predictions,collect_autosomal_hardy_weinberg,collect_variant_count,collect_variant_frequency,collect_variant_missingness,report_fishers_exact_with_corrections,consolidate_reports python;

        class convert_to_pgen,verify_records_against_reference_genome,filter_variant_missingness,filter_sample_missingness,remove_non_standard_chromosomes,remove_unknown_samples,calculate_sample_relatedness,remove_related_samples,extract_provided_coordinates,report_count_partitioned_per_cluster,report_hardy_weinberg_per_cluster,report_missingness_per_cluster plink;

        remove_related_samples --> extract_provided_coordinates
        format_sample_metadata --> extract_provided_coordinates 
        
        
        multipleVcfProtocol --> convert_to_pgen
        format_sample_metadata --> convert_to_pgen
        convert_to_pgen --> verify_records_against_reference_genome
        verify_records_against_reference_genome --> remove_non_standard_chromosomes
        remove_non_standard_chromosomes --> remove_unknown_samples
        remove_unknown_samples --> filter_variant_missingness
        filter_variant_missingness --> filter_sample_missingness
        

        filter_sample_missingness --> calculate_sample_relatedness --> remove_related_samples
        filter_sample_missingness --> remove_related_samples

        query_variant_effect_predictions --> compile_variant_effect_predictions


        extract_provided_coordinates --> variant_count & hardy_weinberg & vep & variant_missingness & fishers_exact

        variant_count & hardy_weinberg & vep & variant_missingness & fishers_exact --> consolidate_reports
    end


    PopulationStructureWorkflow[\Population Structure Workflow/]
    click PopulationStructureWorkflow href "https://tuks-icmm.github.io/Population-Structure-Workflow/workflow/methodology" _blank

    class PopulationStructureWorkflow,ValidateVcfWorkflow workflow;

    remove_related_samples --> PopulationStructureWorkflow


    END((Results))

    consolidate_reports --> END

    PopulationStructureWorkflow --> END

Workflow Rules Explained

Below, each rule in the workflow has been broken down and explained for convenience. In teh case of nested protocols, these are summarized and explained below in nested dropdowns.

format_sample_metadata

  flowchart TD
    format_sample_metadata[[<b>format_sample_metadata</b>: <br>Transpile cluster ownership <br>from sample cluster assignment <br>into input format]]

    classDef python stroke:#FEBE10,fill:#D3D3D3,stroke-width:4px,color:black;
    class format_sample_metadata python;

A python script which uses Pandas to convert the provided samples.csv into a format suitable for Plink-2.

merge protocol *

  flowchart TD
    subgraph multipleVcfProtocol [Multiple dataset protocol]
            direction LR
            multipleVcfProtocolStart(((Start)))
            ifMergeRequired{Is a <br>merge needed?}
            merge_datasets[[<b>merge_datasets</b>: <br>Merge multiple incoming <br>datasets]]

            normalize_merged_datasets[[<b>normalize_merged_datasets</b>: <br>normalize any multi-allelic <br>records created by merge]]
            multipleVcfProtocolEnd(((End)))

            class merge_datasets,normalize_merged_datasets bcftools;
            multipleVcfProtocolStart --> ifMergeRequired
            ifMergeRequired --> |yes| merge_datasets --> normalize_merged_datasets --> multipleVcfProtocolEnd
            ifMergeRequired --> |No| multipleVcfProtocolEnd

            classDef bcftools stroke:#FF5733,fill:#D3D3D3,stroke-width:4px, color:black;
            class merge_datasets,normalize_merged_datasets bcftools;
        end

The merge rule is protected by a decision tree and only executes when multiple described datasets are detected. This rule is responsible for merging multiple datasets into a single VCF file, suitable for collective analysis.

merge_datasets

  flowchart TD
      merge_datasets[[<b>merge_datasets</b>: <br>Merge multiple incoming <br>datasets]]
      
      classDef bcftools stroke:#FF5733,fill:#D3D3D3,stroke-width:4px,color:black;
      class merge_datasets bcftools;

Function

Merge multiple incoming VCF files into a single VCF file.

Command

bcftools merge -O z -o {output} {input.vcf}

Parameters

-O z: Output format (-Oz denotes a BG-Zipped VCF output)
-o {output.vcf}: Output file.

normalize_merged_datasets

  flowchart TD
      normalize_merged_datasets[[<b>normalize_merged_datasets</b>: <br>normalize any multi-allelic <br>records created by merge]]
              
      normalize_merged_datasets[[**normalize_merged_datasets**: normalize any multi-allelic records created by merge]]

      classDef bcftools stroke:#FF5733,fill:#D3D3D3,stroke-width:4px,color:black;
      class normalize_merged_datasets bcftools;

Function

To normalize any multi-allelic records which may arise as a result of the merge.

Command

bcftools norm --multiallelics - -any -Oz -o {output} {input}

Parameters

--multiallelics -: Split multi-allelic records into bi-allelic records.
-O z: Output format (-Oz denotes a BG-Zipped VCF output).
-o {output.vcf}: Output file.

convert_to_pgen

flowchart TD
    convert_to_pgen[[<b>convert_to_pgen</b>: <br>Convert the VCF fields <br>into Plink-2 binary <br>format]]

    classDef plink stroke:#36454F,fill:#D3D3D3,stroke-width:4px,color:black;
    class convert_to_pgen plink;

Function

To convert text-based VCF files into binary Plink-2 PGEN format which is much more performant.

command

plink2 --threads {threads} --vcf {input.vcf} --update-sex {input.sample_metadata} --split-par hg38 --allow-extra-chr --make-pgen vzs --out {params.output}

Parameters

--threads {threads}: Used to set the number of CPU threads used during this calculation
--vcf {input.vcf}: Used to provide the location of the input file plink needs to work on.
--update-sex {input.sample_metadata}: Used to provide plink with sample annotations for downstream processing and use.
--split-par hg38: Used to indicate to plink to use standard coordinates for build hg38 when trimming off the PAR regions on chromosome X.
--allow-extra-chr: Used to indicate to Plink to expect non-standard chromosomes.
--make-pgen vzs: Used to indicate that output should be compiled in Plink-2 binary format (.pvar, .pgen and .psam files).
--out {params.output}: Used to declare the output location that should be used to create the file. File is specified without extension, which is added by Plink.

verify_records_against_reference_genome

  flowchart TD
    verify_records_against_reference_genome[[<b>verify_records_against_reference_genome</b>: <br>Check reference alleles against <br>provided reference genome]]

    classDef plink stroke:#36454F,fill:#D3D3D3,stroke-width:4px,color:black;
    class verify_records_against_reference_genome plink;

Function

To check each loci and comparing its listed reference to that provided in the reference genome.

Command

plink2 --threads {threads} --pfile {params.input} vzs --fa {params.ref} --ref-from-fa force --allow-extra-chr --make-pgen vzs --out {params.output}

Parameters

--threads {threads}: Used to set the number of CPU threads used during this calculation
--pfile {params.input} vzs: Used to provide plink with the location of a plink-2 binary file set (.psam, .pvar and .pgen files), and to expect z-compressed files.
--fa {params.ref}: File path to reference genome to be used for comparison.
--ref-from-fa force: Sets REF allele to provided reference FASTA when possible unambiguously (Does not apply to some INDELS)
--threads {threads}: Used to set the number of CPU threads used during this calculation
--pfile {params.input} vzs: Used to provide plink with the location of a plink-2 binary file set (.psam, .pvar and .pgen files), and to expect z-compressed files.
--make-pgen zs: Save output to a BG-Zipped pgen binary fileset.
--out {params.output}: Provide the file name and path for output creation.

remove_non_standard_chromosomes

  flowchart TD
    remove_non_standard_chromosomes[[<b>remove_non_standard_chromosomes</b>: <br>Filter out non-standard <br>chromosomes]]

    classDef plink stroke:#36454F,fill:#D3D3D3,stroke-width:4px,color:black;
    class remove_non_standard_chromosomes plink;

Function

To filter out non-standard chromosomes.

Command

plink2 --threads {threads} --pfile {params.input} vzs --allow-extra-chr --output-chr chr26 --chr 1-26 --make-pgen vzs --out {params.output}

Parameters

--threads {threads}: Used to set the number of CPU threads used during this calculation
--pfile {params.input} vzs: Used to provide plink with the location of a plink-2 binary file set (.psam, .pvar and .pgen files), and to expect z-compressed files.
--allow-extra-chr: Used to indicate to Plink to expect non-standard chromosomes.
--output-chr chr26: Sets chromosome code notation in output files to include the 'chr' as a prefix.
--chr 1-26: Request a subset of chromosomes to be included in the output file.
--make-pgen zs: Save output to a BG-Zipped pgen binary fileset.
--out {params.output}: Provide the file name and path for output creation.

remove_unknown_samples

  flowchart TD
    remove_unknown_samples[[<b>remove_unknown_samples</b>: <br>Subset samples to labeled <br>samples in metadata files]]

    classDef plink stroke:#36454F,fill:#D3D3D3,stroke-width:4px,color:black;
    class remove_unknown_samples plink;

Function

To remove unneeded samples. This is done by comparison against all provided sample annotations in the input/samples.csv metadata file.

Command

plink2 --threads {threads} --pfile {params.input} vzs --keep {input.sample_metadata} --make-pgen vzs --out {params.output} >{log} 2>&1

Parameters

--threads {threads}: Used to set the number of CPU threads used during this calculation
--pfile {params.input} vzs: Used to provide plink with the location of a plink-2 binary file set (.psam, .pvar and .pgen files), and to expect z-compressed files.
--keep {input.sample_metadata}: Keep only samples mentioned by name in provided samples.csv
--make-pgen zs: Save output to a BG-Zipped pgen binary fileset.
--out {params.output}: Provide the file name and path for output creation.

filter_variant_missingness

  flowchart TD
    filter_variant_missingness[[<b>filter_variant_missingness</b>: <br>Filter variants with 100% <br>missingness]]

    classDef plink stroke:#36454F,fill:#D3D3D3,stroke-width:4px,color:black;
    class filter_variant_missingness plink;

Function

To manage and remove regions of missing calls along the variant-level.

Command

plink2 --threads {threads} --pfile {params.input} vzs --geno 1.0 --make-pgen vzs --out {params.output}

Parameters

--threads {threads}: Used to set the number of CPU threads used during this calculation
--pfile {params.input} vzs: Used to provide plink with the location of a plink-2 binary file set (.psam, .pvar and .pgen files), and to expect z-compressed files.
--geno 1.0: Filters based on 100% variant missingness
--make-pgen zs: Save output to a BG-Zipped pgen binary fileset.
--out {params.output}: Output file.

filter_sample_missingness

  flowchart TD
    filter_sample_missingness[[<b>filter_sample_missingness</b>: <br>Filter samples with 100% <br>missingness]]

    classDef plink stroke:#36454F,fill:#D3D3D3,stroke-width:4px,color:black;
    class filter_sample_missingness plink;

Function

To manage and remove regions of missing calls along the sample-level.

Command

plink2 --threads {threads} --pfile {params.input} vzs --mind 1.0 --make-pgen vzs --out {params.output}

Parameters

--threads {threads}: Used to set the number of CPU threads used during this calculation
--pfile {params.input} vzs: Used to provide plink with the location of a plink-2 binary file set (.psam, .pvar and .pgen files), and to expect z-compressed files.
--mind 1.0: Filters out samples with more than 100% missingness
--make-pgen zs: Save output to a BG-Zipped pgen binary fileset.
--out {params.output}: Provide the file name and path for output creation.

calculate_sample_relatedness

  flowchart TD
    calculate_sample_relatedness[[<b>calculate_sample_relatedness</b>: <br>Calculate relatedness]]

    classDef plink stroke:#36454F,fill:#D3D3D3,stroke-width:4px,color:black;
    class calculate_sample_relatedness plink;

Function

To calculate sample relatedness using a kingship estimator provided by Plink-2.

Command

plink2 --threads {threads} --pfile {params.input} vzs --king-cutoff 0.354 --out {params.output}

Parameters

--threads {threads}: Used to set the number of CPU threads used during this calculation
--pfile {params.input} vzs: Used to provide plink with the location of a plink-2 binary file set (.psam, .pvar and .pgen files), and to expect z-compressed files.
--king-cutoff 0.354: Filters out samples which show a kingship estimator of greater than 0.354.
--out {params.output}: Provide the file name and path for output creation.

remove_related_samples

  flowchart TD
    calculate_sample_relatedness[[<b>calculate_sample_relatedness</b>: <br>Calculate relatedness]]

    classDef plink stroke:#36454F,fill:#D3D3D3,stroke-width:4px,color:black;
    class remove_related_samples plink;

Function

To filter out all but unrelated samples, given the list of samples to keep from its predecessor rules.

Command

plink2 --threads {threads} --pfile {params.input} vzs --keep {input.unrelated_samples} --make-pgen vzs --out {params.output}

Parameters

--threads {threads}: Used to set the number of CPU threads used during this calculation
--pfile {params.input} vzs: Used to provide plink with the location of a plink-2 binary file set (.psam, .pvar and .pgen files), and to expect z-compressed files.
--keep {input.sample_metadata}: Keep only samples mentioned by name in provided samples.csv
--make-pgen zs: Save output to a BG-Zipped pgen binary fileset.
--out {params.output}: Provide the file name and path for output creation.

extract_provided_coordinates

  flowchart TD
    extract_provided_coordinates[[<b>extract_provided_coordinates</b>: <br>Trim the dataset to one <br>of the studied regions]]

    classDef plink stroke:#36454F,fill:#D3D3D3,stroke-width:4px,color:black;
    class extract_provided_coordinates plink;

Function

To filter out all but unrelated samples, given the list of samples to keep from its predecessor rules.

Command

plink2 --threads {threads} --pfile {params.input} vzs --from-bp {params.fromBP} --to-bp {params.toBP} --chr {params.chr} --make-pgen vzs --out {params.output}

Parameters

--threads {threads}: Used to set the number of CPU threads used during this calculation
--pfile {params.input} vzs: Used to provide plink with the location of a plink-2 binary file set (.psam, .pvar and .pgen files), and to expect z-compressed files.
--from-bp: The start co-ordinates to start trimming from.
--to-bp: The stop coordinates to trim until.
--chr: The chromosome on which the coordinates can be found.
--make-pgen zs: Save output to a BG-Zipped pgen binary fileset.
--out {params.output}: Provide the file name and path for output creation.

Allele Count and Frequency protocol

  ---
  config:
      flowchart:
          defaultRenderer: elk
  ---
  flowchart TD
    subgraph variant_count[Variant Count]
      variant_count_START(((Start)))
      variant_count_END(((End)))
      report_count_partitioned_per_cluster[[<b>report_count_partitioned_per_cluster</b>: <br>Perform frequency analysis <br>across each cluster]]
      collect_variant_frequency[[<b>collect_variant_frequency</b>: <br>Collect cluster-level <br>variant frequency reports <br>into one]]
      collect_variant_count[[<b>collect_variant_count</b>: <br>Collect all per-cluster <br>variant count reports <br>into one]]

      variant_count_START --> report_count_partitioned_per_cluster & collect_variant_frequency & collect_variant_count
      report_count_partitioned_per_cluster --> collect_variant_frequency
      report_count_partitioned_per_cluster --> collect_variant_count

      collect_variant_count & collect_variant_frequency --> variant_count_END


      classDef plink stroke:#36454F,fill:#D3D3D3,stroke-width:4px,color:black;
      class report_count_partitioned_per_cluster plink;

      classDef python stroke:#FEBE10,fill:#D3D3D3,stroke-width:4px,color:black;
      class collect_variant_frequency,collect_variant_count python;
    end

This protocol is responsible for performing clustered variant count and frequency analysis. Two python scripts are used to collect the results of each clusters counts and frequencies respectively, and convert these into named columns for the final output.

report_count_partitioned_per_cluster

  flowchart TD    
    report_count_partitioned_per_cluster[[<b>report_count_partitioned_per_cluster</b>: <br>Perform frequency analysis <br>across each cluster]]

    classDef plink stroke:#36454F,fill:#D3D3D3,stroke-width:4px,color:black;
    class report_count_partitioned_per_cluster plink;

Function

To generate a frequency report.

Command

plink2 --threads {threads} --pfile {params.input} vzs --loop-cats {wildcards.cluster} --freq counts cols=chrom,pos,ref,alt,reffreq,altfreq,nobs --out {params.output}

Parameters

--threads {threads}

Used to set the number of CPU threads used during this calculation

--pfile {params.input} vzs

Used to provide plink with the location of a plink-2 binary file set (.psam, .pvar and .pgen files), and to expect z-compressed files.

--loop-cats {wildcards.cluster}

Perform the requested computation repeAtedly, sampeling across the provided clusters, as assigned via the provided samples.csv

--freq counts cols=chrom,pos,ref,alt,reffreq,altfreq,nobs

Calculate the frequency of each variant record. Include the following columns:

Chromosome
Position
Reference Allele
Alternate allele
Reference allele frequency
Alternate allele frequency
Number of observations

--out {params.output}

Provide the file name and path for output creation.

collect_variant_frequency

  flowchart TD
    collect_variant_frequency[[<b>collect_variant_frequency</b>: <br>Collect cluster-level <br>variant frequency reports <br>into one]]

    classDef python stroke:#FEBE10,fill:#D3D3D3,stroke-width:4px,color:black;
    class collect_variant_frequency python;

Function: A python script to compile multiple cluster-level variant-frequency results into a single report.

collect_variant_count

  flowchart TD
    collect_variant_count[[<b>collect_variant_count</b>: <br>Collect all per-cluster <br>variant count reports <br>into one]]

    classDef python stroke:#FEBE10,fill:#D3D3D3,stroke-width:4px,color:black;
    class collect_variant_count python;

Function: A python script to compile multiple cluster-level variant-count results into a single report.

Hardy-Weinberg Protocol

flowchart TD
subgraph hardy_weinberg[Hardy-Weinberg]
  hardy_weinberg_START(((Start)))
  hardy_weinberg_END(((End)))
  report_hardy_weinberg_per_cluster[[<b>report_hardy_weinberg_per_cluster</b>: <br>Perform HWE analysis <br>across each cluster]]
  collect_autosomal_hardy_weinberg[[<b>collect_autosomal_hardy_weinberg</b>: <br>Collect the HWE reports <br>for autosomal locations]]

  hardy_weinberg_START --> report_hardy_weinberg_per_cluster & collect_autosomal_hardy_weinberg
  report_hardy_weinberg_per_cluster --> collect_autosomal_hardy_weinberg
  collect_autosomal_hardy_weinberg --> hardy_weinberg_END

  classDef plink stroke:#36454F,fill:#D3D3D3,stroke-width:4px,color:black;
  class report_hardy_weinberg_per_cluster plink;

  classDef python stroke:#FEBE10,fill:#D3D3D3,stroke-width:4px,color:black;
  class collect_autosomal_hardy_weinberg python;
end

report_hardy_weinberg_per_cluster

  flowchart TD
    report_hardy_weinberg_per_cluster[[<b>report_hardy_weinberg_per_cluster</b>: <br>Perform HWE analysis <br>across each cluster]]

    classDef plink stroke:#36454F,fill:#D3D3D3,stroke-width:4px,color:black;
    class report_hardy_weinberg_per_cluster plink;

Function

To generate a hardy-weinberg report.

Command

plink2 --threads {threads} --pfile {params.input} vzs --loop-cats {wildcards.cluster} --hardy midp cols=chrom,pos,ref,alt,gcounts,hetfreq,p --out {params.output}

Parameters

--threads {threads}

Used to set the number of CPU threads used during this calculation

--pfile {params.input} vzs

Used to provide plink with the location of a plink-2 binary file set (.psam, .pvar and .pgen files), and to expect z-compressed files.

--loop-cats {wildcards.cluster}

Perform the requested computation repeAtedly, sampeling across the provided clusters, as assigned via the provided samples.csv

--hardy midp cols=chrom,pos,ref,alt,gcounts,hetfreq,p

Perform equilibrium, test on each variant record. Include the following columns:

Chromosome
Position
Reference Allele
Alternate allele
Genotype counts
Heterozygote frequency
P-value

--out {params.output}

Provide the file name and path for output creation.

collect_autosomal_hardy_weinberg

  flowchart TD
    collect_autosomal_hardy_weinberg[[<b>collect_autosomal_hardy_weinberg</b>: <br>Collect the HWE reports <br>for autosomal locations]]

    classDef python stroke:#FEBE10,fill:#D3D3D3,stroke-width:4px,color:black;
    class collect_autosomal_hardy_weinberg python;

Function: A python script to compile multiple cluster-level autosomal hardy-weinberg results into a single report.

Missingness Protocol

  flowchart TD
    subgraph variant_missingness[Variant Missingness]
      variant_missingness_START(((Start)))
      variant_missingness_END(((End)))
      report_missingness_per_cluster[[<b>report_missingness_per_cluster</b>: <br>Report the missingness rates <br>observed across each cluster]]

      collect_variant_missingness[[<b>collect_variant_missingness</b>: <br>Collect all per-cluster <br>variant missingness reports <br>into one]]
      variant_missingness_START --> report_missingness_per_cluster & collect_variant_missingness
      report_missingness_per_cluster --> collect_variant_missingness
      collect_variant_missingness --> variant_missingness_END

      classDef plink stroke:#36454F,fill:#D3D3D3,stroke-width:4px,color:black;
      class report_missingness_per_cluster plink;

      classDef python stroke:#FEBE10,fill:#D3D3D3,stroke-width:4px,color:black;
      class collect_variant_missingness python;
    end

report_missingness_per_cluster

  flowchart TD
    report_missingness_per_cluster[[<b>report_missingness_per_cluster</b>: <br>Report the missingness rates <br>observed across each cluster]]

    classDef plink stroke:#36454F,fill:#D3D3D3,stroke-width:4px,color:black;
    class report_missingness_per_cluster plink;

Function

To generate a missingness report.

Command

plink2 --threads {threads} --pfile {params.input} vzs --loop-cats {wildcards.cluster} --missing zs vcols=chrom,pos,ref,alt,provref,nmiss,nobs,fmiss --out {params.output}

Parameters

--threads {threads}

Used to set the number of CPU threads used during this calculation

--pfile {params.input} vzs

Used to provide plink with the location of a plink-2 binary file set (.psam, .pvar and .pgen files), and to expect z-compressed files.

--loop-cats {wildcards.cluster}

Perform the requested computation repeAtedly, sampeling across the provided clusters, as assigned via the provided samples.csv

--missing zs vcols=chrom,pos,ref,alt,provref,nmiss,nobs,fmiss

Perform missingness calculations on each variant record. Include the following columns:

Chromosome
Position
Reference Allele
Alternate allele
Provisional Reference
Number of missing
Number of observations
Frequency of missingness

--out {params.output}

Provide the file name and path for output creation.

collect_variant_missingness

  flowchart TD
    collect_variant_missingness[[<b>collect_variant_missingness</b>: <br>Collect all per-cluster <br>variant missingness reports <br>into one]]

    classDef python stroke:#FEBE10,fill:#D3D3D3,stroke-width:4px,color:black;
    class collect_variant_missingness python;

Function: A python script to compile multiple cluster-level variant-missingness results into a single report.

Fishers Exact Protocol

  flowchart TD
    subgraph fishers_exact[Fishers Exact w/t Bonferoni Correction]
      fishers_exact_START(((Start)))
      fishers_exact_END(((End)))
      report_fishers_exact_with_corrections[[<b>report_fishers_exact_with_corrections</b>: <br>Perform Fishers-Exact test <br>with Bonferonni correction]]

      fishers_exact_START --> report_fishers_exact_with_corrections --> fishers_exact_END

      classDef python stroke:#FEBE10,fill:#D3D3D3,stroke-width:4px,color:black;
      class report_fishers_exact_with_corrections python;
    end

report_fishers_exact_with_corrections

  flowchart TD
    report_fishers_exact_with_corrections[[<b>report_fishers_exact_with_corrections</b>: <br>Perform Fishers-Exact test <br>with Bonferonni correction]]

    classDef python stroke:#FEBE10,fill:#D3D3D3,stroke-width:4px,color:black;
    class report_fishers_exact_with_corrections python;

Function: A python script to perform corrected Fishers-Exact test (Bonferoni method) to identify significant differences in variant frequency between clusters.

Variant Effect Prediction Protocol

  flowchart TD
    subgraph vep[Variant Effect Prediction]
      vep_START(((Start)))
      vep_END(((End)))
      query_variant_effect_predictions[[<b>query_variant_effect_predictions</b>: <br>Perform API calls to <br>E! Ensemble REST API to <br>identify variants]]
      compile_variant_effect_predictions[[<b>compile_variant_effect_predictions</b>: <br>Collect the RAW API <br>payloads and extract <br>relevant metrics]]
      
      vep_START --> query_variant_effect_predictions & compile_variant_effect_predictions
      query_variant_effect_predictions --> compile_variant_effect_predictions
      compile_variant_effect_predictions --> vep_END

      classDef python stroke:#FEBE10,fill:#D3D3D3,stroke-width:4px,color:black;
      class query_variant_effect_predictions,compile_variant_effect_predictions python;
    end

query_variant_effect_predictions

  flowchart TD
    query_variant_effect_predictions[[<b>query_variant_effect_predictions</b>: <br>Perform API calls to <br>E! Ensemble REST API to <br>identify variants]]

    classDef python stroke:#FEBE10,fill:#D3D3D3,stroke-width:4px,color:black;
    class query_variant_effect_predictions python;

Function: A python script to perform batched API calls to the E! Ensembl REST APIs VEP tool to retrieve and store cached variant annotations.

compile_variant_effect_predictions

  flowchart TD
    compile_variant_effect_predictions[[<b>compile_variant_effect_predictions</b>: <br>Collect the RAW API <br>payloads and extract <br>relevant metrics]]

    classDef python stroke:#FEBE10,fill:#D3D3D3,stroke-width:4px,color:black;
    class compile_variant_effect_predictions python;

Function: A python script to extract relevant metrics from the returned annotations.

consolidate_reports

  flowchart TD
    consolidate_reports[[<b>consolidate_reports</b>: <br>Consolidate all the <br>generated reports <br>into one]]

    classDef python stroke:#FEBE10,fill:#D3D3D3,stroke-width:4px,color:black;
    class consolidate_reports python;

Function: A python script to collect and compile all output reports into a single output file for review.