Methodology

An in-depth overview of each step in the workflow.

Table of contents
  1. Rule Map/Diagram
  2. Workflow Rules Explained

Rule Map/Diagram

---
title: Pharmacogenetics Analysis
config:
    layout: elk
---
flowchart TD
    START(((Input Files)))
    click START href "/workflow/data" _blank

    ValidateVcfWorkflow[\Validate VCF Workflow/]
    click ValidateVcfWorkflow href "https://tuks-icmm.github.io/VCF-Validation-Workflow/workflow/methodology" _blank

    START --> ValidateVcfWorkflow --> multipleVcfProtocol
    START --> format_sample_metadata

    subgraph pharmacogeneticsWorkflow [Pharmacogenetics Workflow]
        direction BT

        classDef bcftools stroke:#FF5733,fill:#D3D3D3,stroke-width:4px,color:black;
        classDef plink stroke:#36454F,fill:#D3D3D3,stroke-width:4px,color:black;
        classDef python stroke:#FEBE10,fill:#D3D3D3,stroke-width:4px,color:black;
        classDef admixture stroke:#333,fill:#D3D3D3,stroke-width:4px,color:black;
        classDef tabix stroke:#023020,fill:#D3D3D3,stroke-width:4px,color:black;
        classDef gatk stroke:#007FFF,fill:#D3D3D3,stroke-width:4px,color:black;
        classDef workflow stroke:#fff,fill:#000000,stroke-width:4px,color:white;

        subgraph multipleVcfProtocol [Multiple dataset protocol]
            direction LR
            multipleVcfProtocolStart(((Start)))
            ifMergeRequired{Is a <br>merge needed?}
            merge_datasets[[<b>merge_datasets</b>: <br>Merge multiple incoming <br>datasets]]

            normalize_merged_datasets[[<b>normalize_merged_datasets</b>: <br>normalize any multi-allelic <br>records created by merge]]
            multipleVcfProtocolEnd(((End)))

            class merge_datasets,normalize_merged_datasets bcftools;
            multipleVcfProtocolStart --> ifMergeRequired
            ifMergeRequired --> |yes| merge_datasets --> normalize_merged_datasets --> multipleVcfProtocolEnd
            ifMergeRequired --> |No| multipleVcfProtocolEnd
        end

        format_sample_metadata[[<b>format_sample_metadata</b>: <br>Transpile cluster ownership <br>from sample cluster assignment <br>into input format]]

        convert_to_pgen[[<b>convert_to_pgen</b>: <br>Convert the VCF fields <br>into Plink-2 binary <br>format]]

        verify_records_against_reference_genome[[<b>verify_records_against_reference_genome</b>: <br>Check reference alleles against <br>provided reference genome]]

        remove_non_standard_chromosomes[[<b>remove_non_standard_chromosomes</b>: <br>Filter out non-standard <br>chromosomes]]
        
        remove_unknown_samples[[<b>remove_unknown_samples</b>: <br>Subset samples to labeled <br>samples in metadata files]]
        
        filter_variant_missingness[[<b>filter_variant_missingness</b>: <br>Filter variants with 100% <br>missingness]]
        
        filter_sample_missingness[[<b>filter_sample_missingness</b>: <br>Filter samples with 100% <br>missingness]]

        calculate_sample_relatedness[[<b>calculate_sample_relatedness</b>: <br>Calculate relatedness]]

        remove_related_samples[[<b>remove_related_samples</b>: <br>remove a given list of <br>samples which are too <br>closely related]]

        extract_provided_coordinates[[<b>extract_provided_coordinates</b>: <br>Trim the dataset to one <br>of the studied regions]]

        subgraph variant_count[Variant Count]
            variant_count_START(((Start)))
            variant_count_END(((End)))
            report_count_partitioned_per_cluster[[<b>report_count_partitioned_per_cluster</b>: <br>Perform frequency analysis <br>across each cluster]]
            collect_variant_frequency[[<b>collect_variant_frequency</b>: <br>Collect cluster-level <br>variant frequency reports <br>into one]]
            collect_variant_count[[<b>collect_variant_count</b>: <br>Collect all per-cluster <br>variant count reports <br>into one]]

            variant_count_START --> report_count_partitioned_per_cluster & collect_variant_frequency & collect_variant_count
            report_count_partitioned_per_cluster --> collect_variant_frequency
            report_count_partitioned_per_cluster --> collect_variant_count

            collect_variant_count & collect_variant_frequency --> variant_count_END
        end

        subgraph hardy_weinberg[Hardy-Weinberg]
            hardy_weinberg_START(((Start)))
            hardy_weinberg_END(((End)))
            report_hardy_weinberg_per_cluster[[<b>report_hardy_weinberg_per_cluster</b>: <br>Perform HWE analysis <br>across each cluster]]
            collect_autosomal_hardy_weinberg[[<b>collect_autosomal_hardy_weinberg</b>: <br>Collect the HWE reports <br>for autosomal locations]]

            hardy_weinberg_START --> report_hardy_weinberg_per_cluster & collect_autosomal_hardy_weinberg
            report_hardy_weinberg_per_cluster --> collect_autosomal_hardy_weinberg
            collect_autosomal_hardy_weinberg --> hardy_weinberg_END
        end

        subgraph variant_missingness[Variant Missingness]
            variant_missingness_START(((Start)))
            variant_missingness_END(((End)))
            report_missingness_per_cluster[[<b>report_missingness_per_cluster</b>: <br>Report the missingness rates <br>observed across each cluster]]

            collect_variant_missingness[[<b>collect_variant_missingness</b>: <br>Collect all per-cluster <br>variant missingness reports <br>into one]]
            variant_missingness_START --> report_missingness_per_cluster & collect_variant_missingness
            report_missingness_per_cluster --> collect_variant_missingness
            collect_variant_missingness --> variant_missingness_END
        end

        subgraph vep[Variant Effect Prediction]
            vep_START(((Start)))
            vep_END(((End)))
            query_variant_effect_predictions[[<b>query_variant_effect_predictions</b>: <br>Perform API calls to <br>E! Ensemble REST API to <br>identify variants]]
            compile_variant_effect_predictions[[<b>compile_variant_effect_predictions</b>: <br>Collect the RAW API <br>payloads and extract <br>relevant metrics]]
            
            vep_START --> query_variant_effect_predictions & compile_variant_effect_predictions
            query_variant_effect_predictions --> compile_variant_effect_predictions
            compile_variant_effect_predictions --> vep_END
        end

        subgraph fishers_exact[Fishers Exact w/t Bonferoni Correction]
            fishers_exact_START(((Start)))
            fishers_exact_END(((End)))
            report_fishers_exact_with_corrections[[<b>report_fishers_exact_with_corrections</b>: <br>Perform Fishers-Exact test <br>with Bonferonni correction]]

            fishers_exact_START --> report_fishers_exact_with_corrections --> fishers_exact_END
        end






        consolidate_reports[[<b>consolidate_reports</b>: <br>Consolidate all the <br>generated reports <br>into one]]

        class format_sample_metadata,query_variant_effect_predictions,compile_variant_effect_predictions,collect_autosomal_hardy_weinberg,collect_variant_count,collect_variant_frequency,collect_variant_missingness,report_fishers_exact_with_corrections,consolidate_reports python;

        class convert_to_pgen,verify_records_against_reference_genome,filter_variant_missingness,filter_sample_missingness,remove_non_standard_chromosomes,remove_unknown_samples,calculate_sample_relatedness,remove_related_samples,extract_provided_coordinates,report_count_partitioned_per_cluster,report_hardy_weinberg_per_cluster,report_missingness_per_cluster plink;

        remove_related_samples --> extract_provided_coordinates
        format_sample_metadata --> extract_provided_coordinates 
        
        
        multipleVcfProtocol --> convert_to_pgen
        format_sample_metadata --> convert_to_pgen
        convert_to_pgen --> verify_records_against_reference_genome
        verify_records_against_reference_genome --> remove_non_standard_chromosomes
        remove_non_standard_chromosomes --> remove_unknown_samples
        remove_unknown_samples --> filter_variant_missingness
        filter_variant_missingness --> filter_sample_missingness
        

        filter_sample_missingness --> calculate_sample_relatedness --> remove_related_samples
        filter_sample_missingness --> remove_related_samples

        query_variant_effect_predictions --> compile_variant_effect_predictions


        extract_provided_coordinates --> variant_count & hardy_weinberg & vep & variant_missingness & fishers_exact

        variant_count & hardy_weinberg & vep & variant_missingness & fishers_exact --> consolidate_reports
    end


    PopulationStructureWorkflow[\Population Structure Workflow/]
    click PopulationStructureWorkflow href "https://tuks-icmm.github.io/Population-Structure-Workflow/workflow/methodology" _blank

    class PopulationStructureWorkflow,ValidateVcfWorkflow workflow;

    remove_related_samples --> PopulationStructureWorkflow


    END((Results))

    consolidate_reports --> END

    PopulationStructureWorkflow --> END

Workflow Rules Explained

Below, each rule in the workflow has been broken down and explained for convenience. In teh case of nested protocols, these are summarized and explained below in nested dropdowns.

format_sample_metadata
  flowchart TD
    format_sample_metadata[[<b>format_sample_metadata</b>: <br>Transpile cluster ownership <br>from sample cluster assignment <br>into input format]]

    classDef python stroke:#FEBE10,fill:#D3D3D3,stroke-width:4px,color:black;
    class format_sample_metadata python;

A python script which uses Pandas to convert the provided samples.csv into a format suitable for Plink-2.

merge protocol *
  flowchart TD
    subgraph multipleVcfProtocol [Multiple dataset protocol]
            direction LR
            multipleVcfProtocolStart(((Start)))
            ifMergeRequired{Is a <br>merge needed?}
            merge_datasets[[<b>merge_datasets</b>: <br>Merge multiple incoming <br>datasets]]

            normalize_merged_datasets[[<b>normalize_merged_datasets</b>: <br>normalize any multi-allelic <br>records created by merge]]
            multipleVcfProtocolEnd(((End)))

            class merge_datasets,normalize_merged_datasets bcftools;
            multipleVcfProtocolStart --> ifMergeRequired
            ifMergeRequired --> |yes| merge_datasets --> normalize_merged_datasets --> multipleVcfProtocolEnd
            ifMergeRequired --> |No| multipleVcfProtocolEnd

            classDef bcftools stroke:#FF5733,fill:#D3D3D3,stroke-width:4px, color:black;
            class merge_datasets,normalize_merged_datasets bcftools;
        end

The merge rule is protected by a decision tree and only executes when multiple described datasets are detected. This rule is responsible for merging multiple datasets into a single VCF file, suitable for collective analysis.

merge_datasets
  flowchart TD
      merge_datasets[[<b>merge_datasets</b>: <br>Merge multiple incoming <br>datasets]]
      
      classDef bcftools stroke:#FF5733,fill:#D3D3D3,stroke-width:4px,color:black;
      class merge_datasets bcftools;
Function
Merge multiple incoming VCF files into a single VCF file.
Command
bcftools merge -O z -o {output} {input.vcf}
Parameters
-O z
Output format (-Oz denotes a BG-Zipped VCF output)
-o {output.vcf}
Output file.
normalize_merged_datasets
  flowchart TD
      normalize_merged_datasets[[<b>normalize_merged_datasets</b>: <br>normalize any multi-allelic <br>records created by merge]]
              
      normalize_merged_datasets[[**normalize_merged_datasets**: normalize any multi-allelic records created by merge]]

      classDef bcftools stroke:#FF5733,fill:#D3D3D3,stroke-width:4px,color:black;
      class normalize_merged_datasets bcftools;
Function
To normalize any multi-allelic records which may arise as a result of the merge.
Command
bcftools norm --multiallelics - -any -Oz -o {output} {input}
Parameters
--multiallelics -
Split multi-allelic records into bi-allelic records.
-O z
Output format (-Oz denotes a BG-Zipped VCF output).
-o {output.vcf}
Output file.
convert_to_pgen
flowchart TD
    convert_to_pgen[[<b>convert_to_pgen</b>: <br>Convert the VCF fields <br>into Plink-2 binary <br>format]]

    classDef plink stroke:#36454F,fill:#D3D3D3,stroke-width:4px,color:black;
    class convert_to_pgen plink;
Function
To convert text-based VCF files into binary Plink-2 PGEN format which is much more performant.
command
plink2 --threads {threads} --vcf {input.vcf} --update-sex {input.sample_metadata} --split-par hg38 --allow-extra-chr --make-pgen vzs --out {params.output}
Parameters
--threads {threads}
Used to set the number of CPU threads used during this calculation
--vcf {input.vcf}
Used to provide the location of the input file plink needs to work on.
--update-sex {input.sample_metadata}
Used to provide plink with sample annotations for downstream processing and use.
--split-par hg38
Used to indicate to plink to use standard coordinates for build hg38 when trimming off the PAR regions on chromosome X.
--allow-extra-chr
Used to indicate to Plink to expect non-standard chromosomes.
--make-pgen vzs
Used to indicate that output should be compiled in Plink-2 binary format (.pvar, .pgen and .psam files).
--out {params.output}
Used to declare the output location that should be used to create the file. File is specified without extension, which is added by Plink.
verify_records_against_reference_genome
  flowchart TD
    verify_records_against_reference_genome[[<b>verify_records_against_reference_genome</b>: <br>Check reference alleles against <br>provided reference genome]]

    classDef plink stroke:#36454F,fill:#D3D3D3,stroke-width:4px,color:black;
    class verify_records_against_reference_genome plink;
Function
To check each loci and comparing its listed reference to that provided in the reference genome.
Command
plink2 --threads {threads} --pfile {params.input} vzs --fa {params.ref} --ref-from-fa force --allow-extra-chr --make-pgen vzs --out {params.output}
Parameters
--threads {threads}
Used to set the number of CPU threads used during this calculation
--pfile {params.input} vzs
Used to provide plink with the location of a plink-2 binary file set (.psam, .pvar and .pgen files), and to expect z-compressed files.
--fa {params.ref}
File path to reference genome to be used for comparison.
--ref-from-fa force
Sets REF allele to provided reference FASTA when possible unambiguously (Does not apply to some INDELS)
--threads {threads}
Used to set the number of CPU threads used during this calculation
--pfile {params.input} vzs
Used to provide plink with the location of a plink-2 binary file set (.psam, .pvar and .pgen files), and to expect z-compressed files.
--make-pgen zs
Save output to a BG-Zipped pgen binary fileset.
--out {params.output}
Provide the file name and path for output creation.
remove_non_standard_chromosomes
  flowchart TD
    remove_non_standard_chromosomes[[<b>remove_non_standard_chromosomes</b>: <br>Filter out non-standard <br>chromosomes]]

    classDef plink stroke:#36454F,fill:#D3D3D3,stroke-width:4px,color:black;
    class remove_non_standard_chromosomes plink;
Function
To filter out non-standard chromosomes.
Command
plink2 --threads {threads} --pfile {params.input} vzs --allow-extra-chr --output-chr chr26 --chr 1-26 --make-pgen vzs --out {params.output}
Parameters
--threads {threads}
Used to set the number of CPU threads used during this calculation
--pfile {params.input} vzs
Used to provide plink with the location of a plink-2 binary file set (.psam, .pvar and .pgen files), and to expect z-compressed files.
--allow-extra-chr
Used to indicate to Plink to expect non-standard chromosomes.
--output-chr chr26
Sets chromosome code notation in output files to include the 'chr' as a prefix.
--chr 1-26
Request a subset of chromosomes to be included in the output file.
--make-pgen zs
Save output to a BG-Zipped pgen binary fileset.
--out {params.output}
Provide the file name and path for output creation.
remove_unknown_samples
  flowchart TD
    remove_unknown_samples[[<b>remove_unknown_samples</b>: <br>Subset samples to labeled <br>samples in metadata files]]

    classDef plink stroke:#36454F,fill:#D3D3D3,stroke-width:4px,color:black;
    class remove_unknown_samples plink;
Function
To remove unneeded samples. This is done by comparison against all provided sample annotations in the input/samples.csv metadata file.
Command
plink2 --threads {threads} --pfile {params.input} vzs --keep {input.sample_metadata} --make-pgen vzs --out {params.output} >{log} 2>&1
Parameters
--threads {threads}
Used to set the number of CPU threads used during this calculation
--pfile {params.input} vzs
Used to provide plink with the location of a plink-2 binary file set (.psam, .pvar and .pgen files), and to expect z-compressed files.
--keep {input.sample_metadata}
Keep only samples mentioned by name in provided samples.csv
--make-pgen zs
Save output to a BG-Zipped pgen binary fileset.
--out {params.output}
Provide the file name and path for output creation.
filter_variant_missingness
  flowchart TD
    filter_variant_missingness[[<b>filter_variant_missingness</b>: <br>Filter variants with 100% <br>missingness]]

    classDef plink stroke:#36454F,fill:#D3D3D3,stroke-width:4px,color:black;
    class filter_variant_missingness plink;
Function
To manage and remove regions of missing calls along the variant-level.
Command
plink2 --threads {threads} --pfile {params.input} vzs --geno 1.0 --make-pgen vzs --out {params.output}
Parameters
--threads {threads}
Used to set the number of CPU threads used during this calculation
--pfile {params.input} vzs
Used to provide plink with the location of a plink-2 binary file set (.psam, .pvar and .pgen files), and to expect z-compressed files.
--geno 1.0
Filters based on 100% variant missingness
--make-pgen zs
Save output to a BG-Zipped pgen binary fileset.
--out {params.output}
Output file.
filter_sample_missingness
  flowchart TD
    filter_sample_missingness[[<b>filter_sample_missingness</b>: <br>Filter samples with 100% <br>missingness]]

    classDef plink stroke:#36454F,fill:#D3D3D3,stroke-width:4px,color:black;
    class filter_sample_missingness plink;
Function
To manage and remove regions of missing calls along the sample-level.
Command
plink2 --threads {threads} --pfile {params.input} vzs --mind 1.0 --make-pgen vzs --out {params.output}
Parameters
--threads {threads}
Used to set the number of CPU threads used during this calculation
--pfile {params.input} vzs
Used to provide plink with the location of a plink-2 binary file set (.psam, .pvar and .pgen files), and to expect z-compressed files.
--mind 1.0
Filters out samples with more than 100% missingness
--make-pgen zs
Save output to a BG-Zipped pgen binary fileset.
--out {params.output}
Provide the file name and path for output creation.
calculate_sample_relatedness
  flowchart TD
    calculate_sample_relatedness[[<b>calculate_sample_relatedness</b>: <br>Calculate relatedness]]

    classDef plink stroke:#36454F,fill:#D3D3D3,stroke-width:4px,color:black;
    class calculate_sample_relatedness plink;
Function
To calculate sample relatedness using a kingship estimator provided by Plink-2.
Command
plink2 --threads {threads} --pfile {params.input} vzs --king-cutoff 0.354 --out {params.output}
Parameters
--threads {threads}
Used to set the number of CPU threads used during this calculation
--pfile {params.input} vzs
Used to provide plink with the location of a plink-2 binary file set (.psam, .pvar and .pgen files), and to expect z-compressed files.
--king-cutoff 0.354
Filters out samples which show a kingship estimator of greater than 0.354.
--out {params.output}
Provide the file name and path for output creation.
remove_related_samples
  flowchart TD
    calculate_sample_relatedness[[<b>calculate_sample_relatedness</b>: <br>Calculate relatedness]]

    classDef plink stroke:#36454F,fill:#D3D3D3,stroke-width:4px,color:black;
    class remove_related_samples plink;
Function
To filter out all but unrelated samples, given the list of samples to keep from its predecessor rules.
Command
plink2 --threads {threads} --pfile {params.input} vzs --keep {input.unrelated_samples} --make-pgen vzs --out {params.output}
Parameters
--threads {threads}
Used to set the number of CPU threads used during this calculation
--pfile {params.input} vzs
Used to provide plink with the location of a plink-2 binary file set (.psam, .pvar and .pgen files), and to expect z-compressed files.
--keep {input.sample_metadata}
Keep only samples mentioned by name in provided samples.csv
--make-pgen zs
Save output to a BG-Zipped pgen binary fileset.
--out {params.output}
Provide the file name and path for output creation.
extract_provided_coordinates
  flowchart TD
    extract_provided_coordinates[[<b>extract_provided_coordinates</b>: <br>Trim the dataset to one <br>of the studied regions]]

    classDef plink stroke:#36454F,fill:#D3D3D3,stroke-width:4px,color:black;
    class extract_provided_coordinates plink;
Function
To filter out all but unrelated samples, given the list of samples to keep from its predecessor rules.
Command
plink2 --threads {threads} --pfile {params.input} vzs --from-bp {params.fromBP} --to-bp {params.toBP} --chr {params.chr} --make-pgen vzs --out {params.output}
Parameters
--threads {threads}
Used to set the number of CPU threads used during this calculation
--pfile {params.input} vzs
Used to provide plink with the location of a plink-2 binary file set (.psam, .pvar and .pgen files), and to expect z-compressed files.
--from-bp
The start co-ordinates to start trimming from.
--to-bp
The stop coordinates to trim until.
--chr
The chromosome on which the coordinates can be found.
--make-pgen zs
Save output to a BG-Zipped pgen binary fileset.
--out {params.output}
Provide the file name and path for output creation.
Allele Count and Frequency protocol
  ---
  config:
      flowchart:
          defaultRenderer: elk
  ---
  flowchart TD
    subgraph variant_count[Variant Count]
      variant_count_START(((Start)))
      variant_count_END(((End)))
      report_count_partitioned_per_cluster[[<b>report_count_partitioned_per_cluster</b>: <br>Perform frequency analysis <br>across each cluster]]
      collect_variant_frequency[[<b>collect_variant_frequency</b>: <br>Collect cluster-level <br>variant frequency reports <br>into one]]
      collect_variant_count[[<b>collect_variant_count</b>: <br>Collect all per-cluster <br>variant count reports <br>into one]]

      variant_count_START --> report_count_partitioned_per_cluster & collect_variant_frequency & collect_variant_count
      report_count_partitioned_per_cluster --> collect_variant_frequency
      report_count_partitioned_per_cluster --> collect_variant_count

      collect_variant_count & collect_variant_frequency --> variant_count_END


      classDef plink stroke:#36454F,fill:#D3D3D3,stroke-width:4px,color:black;
      class report_count_partitioned_per_cluster plink;

      classDef python stroke:#FEBE10,fill:#D3D3D3,stroke-width:4px,color:black;
      class collect_variant_frequency,collect_variant_count python;
    end

This protocol is responsible for performing clustered variant count and frequency analysis. Two python scripts are used to collect the results of each clusters counts and frequencies respectively, and convert these into named columns for the final output.

report_count_partitioned_per_cluster
  flowchart TD    
    report_count_partitioned_per_cluster[[<b>report_count_partitioned_per_cluster</b>: <br>Perform frequency analysis <br>across each cluster]]

    classDef plink stroke:#36454F,fill:#D3D3D3,stroke-width:4px,color:black;
    class report_count_partitioned_per_cluster plink;
Function
To generate a frequency report.
Command
plink2 --threads {threads} --pfile {params.input} vzs --loop-cats {wildcards.cluster} --freq counts cols=chrom,pos,ref,alt,reffreq,altfreq,nobs --out {params.output}
Parameters
--threads {threads}
Used to set the number of CPU threads used during this calculation
--pfile {params.input} vzs
Used to provide plink with the location of a plink-2 binary file set (.psam, .pvar and .pgen files), and to expect z-compressed files.
--loop-cats {wildcards.cluster}
Perform the requested computation repeAtedly, sampeling across the provided clusters, as assigned via the provided samples.csv
--freq counts cols=chrom,pos,ref,alt,reffreq,altfreq,nobs
Calculate the frequency of each variant record. Include the following columns:
  • Chromosome
  • Position
  • Reference Allele
  • Alternate allele
  • Reference allele frequency
  • Alternate allele frequency
  • Number of observations
--out {params.output}
Provide the file name and path for output creation.
collect_variant_frequency
  flowchart TD
    collect_variant_frequency[[<b>collect_variant_frequency</b>: <br>Collect cluster-level <br>variant frequency reports <br>into one]]

    classDef python stroke:#FEBE10,fill:#D3D3D3,stroke-width:4px,color:black;
    class collect_variant_frequency python;
Function
A python script to compile multiple cluster-level variant-frequency results into a single report.
collect_variant_count
  flowchart TD
    collect_variant_count[[<b>collect_variant_count</b>: <br>Collect all per-cluster <br>variant count reports <br>into one]]

    classDef python stroke:#FEBE10,fill:#D3D3D3,stroke-width:4px,color:black;
    class collect_variant_count python;
Function
A python script to compile multiple cluster-level variant-count results into a single report.
Hardy-Weinberg Protocol
flowchart TD
subgraph hardy_weinberg[Hardy-Weinberg]
  hardy_weinberg_START(((Start)))
  hardy_weinberg_END(((End)))
  report_hardy_weinberg_per_cluster[[<b>report_hardy_weinberg_per_cluster</b>: <br>Perform HWE analysis <br>across each cluster]]
  collect_autosomal_hardy_weinberg[[<b>collect_autosomal_hardy_weinberg</b>: <br>Collect the HWE reports <br>for autosomal locations]]

  hardy_weinberg_START --> report_hardy_weinberg_per_cluster & collect_autosomal_hardy_weinberg
  report_hardy_weinberg_per_cluster --> collect_autosomal_hardy_weinberg
  collect_autosomal_hardy_weinberg --> hardy_weinberg_END

  classDef plink stroke:#36454F,fill:#D3D3D3,stroke-width:4px,color:black;
  class report_hardy_weinberg_per_cluster plink;

  classDef python stroke:#FEBE10,fill:#D3D3D3,stroke-width:4px,color:black;
  class collect_autosomal_hardy_weinberg python;
end
report_hardy_weinberg_per_cluster
  flowchart TD
    report_hardy_weinberg_per_cluster[[<b>report_hardy_weinberg_per_cluster</b>: <br>Perform HWE analysis <br>across each cluster]]

    classDef plink stroke:#36454F,fill:#D3D3D3,stroke-width:4px,color:black;
    class report_hardy_weinberg_per_cluster plink;
Function
To generate a hardy-weinberg report.
Command
plink2 --threads {threads} --pfile {params.input} vzs --loop-cats {wildcards.cluster} --hardy midp cols=chrom,pos,ref,alt,gcounts,hetfreq,p --out {params.output}
Parameters
--threads {threads}
Used to set the number of CPU threads used during this calculation
--pfile {params.input} vzs
Used to provide plink with the location of a plink-2 binary file set (.psam, .pvar and .pgen files), and to expect z-compressed files.
--loop-cats {wildcards.cluster}
Perform the requested computation repeAtedly, sampeling across the provided clusters, as assigned via the provided samples.csv
--hardy midp cols=chrom,pos,ref,alt,gcounts,hetfreq,p
Perform equilibrium, test on each variant record. Include the following columns:
  • Chromosome
  • Position
  • Reference Allele
  • Alternate allele
  • Genotype counts
  • Heterozygote frequency
  • P-value
--out {params.output}
Provide the file name and path for output creation.
collect_autosomal_hardy_weinberg
  flowchart TD
    collect_autosomal_hardy_weinberg[[<b>collect_autosomal_hardy_weinberg</b>: <br>Collect the HWE reports <br>for autosomal locations]]

    classDef python stroke:#FEBE10,fill:#D3D3D3,stroke-width:4px,color:black;
    class collect_autosomal_hardy_weinberg python;
Function
A python script to compile multiple cluster-level autosomal hardy-weinberg results into a single report.
Missingness Protocol
  flowchart TD
    subgraph variant_missingness[Variant Missingness]
      variant_missingness_START(((Start)))
      variant_missingness_END(((End)))
      report_missingness_per_cluster[[<b>report_missingness_per_cluster</b>: <br>Report the missingness rates <br>observed across each cluster]]

      collect_variant_missingness[[<b>collect_variant_missingness</b>: <br>Collect all per-cluster <br>variant missingness reports <br>into one]]
      variant_missingness_START --> report_missingness_per_cluster & collect_variant_missingness
      report_missingness_per_cluster --> collect_variant_missingness
      collect_variant_missingness --> variant_missingness_END

      classDef plink stroke:#36454F,fill:#D3D3D3,stroke-width:4px,color:black;
      class report_missingness_per_cluster plink;

      classDef python stroke:#FEBE10,fill:#D3D3D3,stroke-width:4px,color:black;
      class collect_variant_missingness python;
    end
report_missingness_per_cluster
  flowchart TD
    report_missingness_per_cluster[[<b>report_missingness_per_cluster</b>: <br>Report the missingness rates <br>observed across each cluster]]

    classDef plink stroke:#36454F,fill:#D3D3D3,stroke-width:4px,color:black;
    class report_missingness_per_cluster plink;
Function
To generate a missingness report.
Command
plink2 --threads {threads} --pfile {params.input} vzs --loop-cats {wildcards.cluster} --missing zs vcols=chrom,pos,ref,alt,provref,nmiss,nobs,fmiss --out {params.output}
Parameters
--threads {threads}
Used to set the number of CPU threads used during this calculation
--pfile {params.input} vzs
Used to provide plink with the location of a plink-2 binary file set (.psam, .pvar and .pgen files), and to expect z-compressed files.
--loop-cats {wildcards.cluster}
Perform the requested computation repeAtedly, sampeling across the provided clusters, as assigned via the provided samples.csv
--missing zs vcols=chrom,pos,ref,alt,provref,nmiss,nobs,fmiss
Perform missingness calculations on each variant record. Include the following columns:
  • Chromosome
  • Position
  • Reference Allele
  • Alternate allele
  • Provisional Reference
  • Number of missing
  • Number of observations
  • Frequency of missingness
--out {params.output}
Provide the file name and path for output creation.
collect_variant_missingness
  flowchart TD
    collect_variant_missingness[[<b>collect_variant_missingness</b>: <br>Collect all per-cluster <br>variant missingness reports <br>into one]]

    classDef python stroke:#FEBE10,fill:#D3D3D3,stroke-width:4px,color:black;
    class collect_variant_missingness python;
Function
A python script to compile multiple cluster-level variant-missingness results into a single report.
Fishers Exact Protocol
  flowchart TD
    subgraph fishers_exact[Fishers Exact w/t Bonferoni Correction]
      fishers_exact_START(((Start)))
      fishers_exact_END(((End)))
      report_fishers_exact_with_corrections[[<b>report_fishers_exact_with_corrections</b>: <br>Perform Fishers-Exact test <br>with Bonferonni correction]]

      fishers_exact_START --> report_fishers_exact_with_corrections --> fishers_exact_END

      classDef python stroke:#FEBE10,fill:#D3D3D3,stroke-width:4px,color:black;
      class report_fishers_exact_with_corrections python;
    end
report_fishers_exact_with_corrections
  flowchart TD
    report_fishers_exact_with_corrections[[<b>report_fishers_exact_with_corrections</b>: <br>Perform Fishers-Exact test <br>with Bonferonni correction]]

    classDef python stroke:#FEBE10,fill:#D3D3D3,stroke-width:4px,color:black;
    class report_fishers_exact_with_corrections python;
Function
A python script to perform corrected Fishers-Exact test (Bonferoni method) to identify significant differences in variant frequency between clusters.
Variant Effect Prediction Protocol
  flowchart TD
    subgraph vep[Variant Effect Prediction]
      vep_START(((Start)))
      vep_END(((End)))
      query_variant_effect_predictions[[<b>query_variant_effect_predictions</b>: <br>Perform API calls to <br>E! Ensemble REST API to <br>identify variants]]
      compile_variant_effect_predictions[[<b>compile_variant_effect_predictions</b>: <br>Collect the RAW API <br>payloads and extract <br>relevant metrics]]
      
      vep_START --> query_variant_effect_predictions & compile_variant_effect_predictions
      query_variant_effect_predictions --> compile_variant_effect_predictions
      compile_variant_effect_predictions --> vep_END

      classDef python stroke:#FEBE10,fill:#D3D3D3,stroke-width:4px,color:black;
      class query_variant_effect_predictions,compile_variant_effect_predictions python;
    end
  
query_variant_effect_predictions
  flowchart TD
    query_variant_effect_predictions[[<b>query_variant_effect_predictions</b>: <br>Perform API calls to <br>E! Ensemble REST API to <br>identify variants]]

    classDef python stroke:#FEBE10,fill:#D3D3D3,stroke-width:4px,color:black;
    class query_variant_effect_predictions python;
Function
A python script to perform batched API calls to the E! Ensembl REST APIs VEP tool to retrieve and store cached variant annotations.
compile_variant_effect_predictions
  flowchart TD
    compile_variant_effect_predictions[[<b>compile_variant_effect_predictions</b>: <br>Collect the RAW API <br>payloads and extract <br>relevant metrics]]

    classDef python stroke:#FEBE10,fill:#D3D3D3,stroke-width:4px,color:black;
    class compile_variant_effect_predictions python;
Function
A python script to extract relevant metrics from the returned annotations.
consolidate_reports
  flowchart TD
    consolidate_reports[[<b>consolidate_reports</b>: <br>Consolidate all the <br>generated reports <br>into one]]

    classDef python stroke:#FEBE10,fill:#D3D3D3,stroke-width:4px,color:black;
    class consolidate_reports python;
Function
A python script to collect and compile all output reports into a single output file for review.

This work is licensed under a Creative Commons Attribution 4.0 International License.. This project is managed by the Institute for Cellular and Molecular Medicine at the University of Pretoria.