Haplotagging (Long-read DNA)

Phase small and structural variants in long-read DNA sequencing BAM files

usage

nexus run --nf-workflow haplotagging_long-read-dna.nf -params-file params.yaml
Note

Nextflow config files are available here. Use the config file that matches your installed nexus version (e.g. nexus_v0.2.0_nextflow_slurm.config).

parameters

Download params.yaml

# =============================================================================
# params.yaml — haplotagging_long-read-dna
#
# Usage:
#   nextflow run haplotagging_long-read-dna.nf -params-file params.yaml
#
# Three orthogonal selectors decide what runs (CSV list, "all", or "none"/""):
#   small_variant_callers       {deepvariant, longshot, clair3}
#   structural_variant_callers  {pbsv}
#   phasing_methods             {whatshap, hiphase, hapcut2-whatshap,
#                                longphase, margin}
#
# Each requested phaser runs ONCE PER selected small-variant caller, with
# outputs at ${output_dir}/${sample_id}/<caller>_<phaser>/.
#
# Special cases:
#   * Longshot calls AND phases in one step — its phased BAM publishes
#     whenever "longshot" is in small_variant_callers.
#   * 'hiphase' requires "pbsv" to be selected in structural_variant_callers
#     (explicitly or via "all").
#   * DeepVariant → HapCUT2 and Clair3 → HiPhase paths are NOT wired
#     (the combination is skipped without failing; a warning is logged).
# =============================================================================


# ---- required: input/output paths -------------------------------------------

# Sample sheet (TSV) with the columns: sample_id, bam_file, bam_bai_file
samples_tsv_file: ''

# Destination directory; output files are copied here
output_dir: ''

# Reference genome in FASTA format (.fa or .fa.gz)
reference_genome_fasta_file: ''


# ---- required: pipeline selectors -------------------------------------------

# Each selector accepts a CSV list of tool names, "all", or "none"/"" (empty).
small_variant_callers: 'all'
structural_variant_callers: 'all'
phasing_methods: 'all'

# How haplotagged reads are published:
#   "bam"   →  full haplotagged BAM(s) (default).
#   "tsv"   →  only a haplotag TSV per phaser (~1000× smaller).
#   "both"  →  BAM and TSV together.
# Phased VCFs are published regardless of this setting.
haplotag_output: 'bam'


# =============================================================================
# Tool-specific settings (only consumed when the corresponding tool runs).
# =============================================================================

# DeepVariant — input_path and output_path are host paths that get bind-mounted
# into the container; both are REQUIRED whenever DeepVariant runs.
# NOTE: small_variant_callers "all" includes deepvariant, so fill in both paths
# unless DeepVariant is deselected.
deepvariant:
  input_path: ''
  output_path: ''
  containerization: 'singularity'   # "singularity" | "docker"
  model_type: 'PACBIO'              # PACBIO | ONT_R104 | WGS | WES | HYBRID_PACBIO_ILLUMINA
  bin_path: '/opt/deepvariant/bin/run_deepvariant'
  bin_version: '1.9.0'

# Longshot — optional extra arguments (empty by default)
longshot:
  extra_args: ''

# Clair3 — extra_args must match your sequencing chemistry. Common defaults
# available inside the image:
#   PacBio HiFi (Sequel II) :  --model_path=/opt/models/hifi_sequel2/ --platform=hifi
#   PacBio HiFi (Revio)     :  --model_path=/opt/models/hifi_revio/   --platform=hifi
#   ONT R10.4               :  --model_path=/opt/models/ont_r10/      --platform=ont
# The value below uses the Sequel II settings plus --min_coverage=3.
clair3:
  extra_args: '--model_path=/opt/models/hifi_sequel2/ --platform=hifi --min_coverage=3'

# pbsv — separate extra-argument strings for the discover and call steps
pbsv:
  discover_extra_args: ''
  call_extra_args: ''

# WhatsHap — extra arguments for the phase and haplotag steps. This section is
# also used as the fallback for hapcut2_whatshap.whatshap_haplotag_extra_args
# (presumably via haplotag_extra_args — confirm against the workflow).
whatshap:
  phase_extra_args: '--mapq 20'
  haplotag_extra_args: '--ignore-read-groups --skip-missing-contigs --output-threads 4'

# HiPhase has no extra-args knobs in this workflow.

# HapCUT2 combined with WhatsHap haplotag
hapcut2_whatshap:
  read_technology: 'pacbio'          # "pacbio" | "ont" | "illumina"
  extracthairs_extra_args: ''
  hapcut2_extra_args: ''
  whatshap_haplotag_extra_args: '--ignore-read-groups --skip-missing-contigs --output-threads 4'

# LongPhase — the `phase` step requires exactly one platform flag:
#   --ont    Oxford Nanopore
#   --pb     PacBio (HiFi/CCS and CLR both use --pb)
# The `haplotag` step has no platform flag, so leave haplotag_extra_args empty.
longphase:
  phase_extra_args: '--pb'
  haplotag_extra_args: ''

# Margin — `margin phase` (v2.3.1) produces the phased VCF and the haplotagged
# BAM in a single call, so the phase params JSON is the only params file needed.
# Choose a host-accessible JSON that matches your sequencing platform; the
# margin docker image ships defaults under /opt/margin/params/phase/, e.g.
#   PacBio HiFi :  allParams.phase_vcf.pb-hifi.json
#   ONT R10.4   :  allParams.phase_vcf.ont.json
margin:
  phase_params_json_file: ''
  phase_extra_args: ''