Advanced Parameter Tuning

Segmentation Parameters

The segmentation behavior can be controlled by several parameters:

# More sensitive segmentation (more segments)
# segmentation_gamma is the penalty for starting a new segment, so a
# lower value allows more breakpoints.
battenberg(
  # ... other parameters ...
  segmentation_gamma = 5,   # Lower penalty = more segments
  segmentation_kmin = 1,    # Minimum segment size
  phasing_kmin = 1          # Minimum size for phasing
)

# Less sensitive segmentation (fewer segments)
# A higher penalty suppresses breakpoints, giving fewer, longer segments.
battenberg(
  # ... other parameters ...
  segmentation_gamma = 25,  # Higher penalty = fewer segments
  segmentation_kmin = 5,    # Larger minimum segment size
  phasing_kmin = 3          # Larger minimum size for phasing
)

Purity and Ploidy Constraints

Adjust expected ranges based on sample characteristics:

# High purity sample: require a purity (rho) of at least 0.8, widen the
# ploidy ceiling and raise the goodness-of-fit cut-off.
battenberg(
  # ... other parameters ...
  min_rho = 0.8,           # Minimum 80% purity
  max_ploidy = 6.0,        # Allow higher ploidy
  min_goodness = 0.65      # Stricter goodness of fit
)

# Low purity sample: accept a purity (rho) as low as 0.3, narrow the
# ploidy ceiling and relax the goodness-of-fit cut-off.
battenberg(
  # ... other parameters ...
  min_rho = 0.3,           # Allow 30% purity
  max_ploidy = 3.5,        # Lower ploidy range
  min_goodness = 0.55      # More lenient goodness
)

Quality Control Parameters

# Strict quality control: raise the depth, base-quality and
# mapping-quality cut-offs together.
battenberg(
  # ... other parameters ...
  min_normal_depth = 15,   # Higher coverage requirement
  min_base_qual = 25,      # Higher base quality
  min_map_qual = 40,       # Higher mapping quality
  # NOTE(review): confirm that 0.49 is stricter than the package default
  uninformative_BAF_threshold = 0.49  # Stricter BAF threshold
)

Using Prior Structural Variant Breakpoints

Battenberg can incorporate prior breakpoints from structural variant calls:

# Create prior breakpoints file (2 columns: chr, pos)
# Positions are integer literals (L suffix): write.table() would otherwise
# emit doubles such as 5000000 in scientific notation ("5e+06").
prior_breakpoints <- data.frame(
  chr = c("1", "1", "2", "3"),
  pos = c(1500000L, 2500000L, 5000000L, 1000000L)
)

# Tab-separated, without header row, row names or quoting.
# NOTE(review): confirm whether Battenberg expects a header row in this file.
write.table(prior_breakpoints, "prior_breakpoints.txt",
            row.names = FALSE, col.names = FALSE,
            quote = FALSE, sep = "\t")

# Use in Battenberg: pass the path of the breakpoints file written above
battenberg(
  # ... other parameters ...
  prior_breakpoints_file = "prior_breakpoints.txt"
)

Using Beagle5 for Imputation

For improved phasing, especially with newer reference panels:

# Setup Beagle5 parameters
# The two *_TEMPLATE paths contain the literal token CHROMNAME
# (presumably replaced with each chromosome name at run time; confirm).
BEAGLEJAR <- "path/to/beagle.24Aug19.3e8.jar"
BEAGLEREF_TEMPLATE <- "path/to/beagle_ref_chrCHROMNAME.1kg.phase3.v5a.b37.bref3"
BEAGLEPLINK_TEMPLATE <- "path/to/plink.chrCHROMNAME.GRCh37.map"

# Enable Beagle and hand over the jar, reference and map locations
# defined above, together with its resource settings.
battenberg(
  # ... other parameters ...
  usebeagle = TRUE,
  beaglejar = BEAGLEJAR,
  beagleref = BEAGLEREF_TEMPLATE,
  beagleplink = BEAGLEPLINK_TEMPLATE,
  beaglemaxmem = 16,       # Memory in GB
  beaglenthreads = 4,      # Threads for Beagle
  beaglewindow = 40,       # Window size
  beagleoverlap = 4        # Overlap size
)

Multisample Analysis

For analyzing multiple samples together:

# Define multiple samples
# The four vectors are parallel: element i of each one describes the same
# sample (names and BAM paths for tumour and normal).
tumournames <- c("sample1_tumor", "sample2_tumor", "sample3_tumor")
normalnames <- c("sample1_normal", "sample2_normal", "sample3_normal")
tumourbams <- c("path/to/sample1_tumor.bam", "path/to/sample2_tumor.bam", 
                "path/to/sample3_tumor.bam")
normalbams <- c("path/to/sample1_normal.bam", "path/to/sample2_normal.bam", 
                "path/to/sample3_normal.bam")

# Run multisample analysis
# The sample arguments receive the full vectors rather than single values.
battenberg(
  tumourname = tumournames,
  normalname = normalnames,
  tumour_data_file = tumourbams,
  normal_data_file = normalbams,
  # ... other parameters ...
  multisample_maxlag = 150,  # Max upstream SNPs for multisample phasing
  multisample_relative_weight_balanced = 0.5,  # Weight for balanced samples
  write_battenberg_phasing = TRUE  # Write phasing results
)

Cell Line Analysis

For cell line data (tumor-only analysis):

# Cell line run: there is no matched normal, so both normal arguments
# are set to NA and the analysis mode is switched to "cell_line".
battenberg(
  analysis = "cell_line",  # Changed from default "paired"
  tumourname = "cell_line_sample",
  normalname = NA,         # No normal sample
  tumour_data_file = "path/to/cell_line.bam",
  normal_data_file = NA,   # No normal BAM
  # ... other parameters adjusted for cell line analysis ...
  min_rho = 0.95,         # Expect high purity
  min_goodness = 0.7      # Stricter goodness for cell lines
)

SNP Array Analysis

For SNP6 array data:

# SNP6 array run: switch data_type to "snp6" and supply the reference
# info file plus the names of the external command-line tools.
battenberg(
  # ... other parameters ...
  data_type = "snp6",
  platform_gamma = 1,
  snp6_reference_info_file = "path/to/snp6_reference_info.txt",
  # Executable names as given here are presumably resolved via PATH; confirm
  apt.probeset.genotype.exe = "apt-probeset-genotype",
  apt.probeset.summarize.exe = "apt-probeset-summarize",
  norm.geno.clust.exe = "normalize_affy_geno_cluster.pl",
  birdseed_report_file = "birdseed.report.txt"
)

Performance Optimization

Parallel Processing

# Use more threads for faster processing
# nthreads and beaglenthreads are set independently of each other.
battenberg(
  # ... other parameters ...
  nthreads = 16,           # Use 16 CPU cores
  beaglenthreads = 8       # Use 8 cores for Beagle (if using)
)

Memory Management

# For large datasets, adjust memory settings
# Consider running chromosomes separately for very large files
battenberg(
  # ... other parameters ...
  beaglemaxmem = 32        # 32GB for Beagle
)

Skipping Steps

For rerunning parts of the analysis:

# Skip allele counting if already done
# Only the allele-counting flag is TRUE; the other two steps still run.
battenberg(
  # ... other parameters ...
  skip_allele_counting = TRUE,
  skip_preprocessing = FALSE,
  skip_phasing = FALSE
)

# Skip preprocessing if rerunning
# Only the preprocessing flag is TRUE; the other two flags stay FALSE.
battenberg(
  # ... other parameters ...
  skip_allele_counting = FALSE,
  skip_preprocessing = TRUE,
  skip_phasing = FALSE
)

Custom Genome Builds

For different reference genomes:

# Specify genome build
# Ensure reference files match the specified build
battenberg(
  # ... other parameters ...
  GENOMEBUILD = "hg38"   # or "hg19"
)

External Haplotype Files

Using external phasing information:

# Supply phasing from an external haplotype file (a VCF path here) and
# also write out Battenberg's own phasing results.
battenberg(
  # ... other parameters ...
  externalhaplotypefile = "path/to/external_haplotypes.vcf",
  write_battenberg_phasing = TRUE
)

Troubleshooting Common Issues

Low Quality Samples

  • Increase min_normal_depth and quality thresholds
  • Adjust min_goodness to be more lenient
  • Check coverage uniformity

Highly Aneuploid Samples

  • Increase max_ploidy range
  • Adjust segmentation_gamma for appropriate segment resolution (a lower gamma gives more, shorter segments; a higher gamma gives fewer, longer ones)

Contaminated Samples

  • Lower min_rho threshold
  • Consider pre-processing to estimate contamination

Memory Issues

  • Reduce beaglemaxmem if running out of memory
  • Process chromosomes separately
  • Use fewer threads if memory-limited

Quality Assessment

After running Battenberg, assess quality using:

  1. Distance plot: Check purity/ploidy solution space
  2. Profile plots: Examine copy number profiles for artifacts
  3. Coverage plots: Verify uniform coverage
  4. BAF plots: Check for proper phase separation
# Example quality check
cn_data <- read.delim("sample_tumor_copynumber.txt")

# Check for very short segments (potential artifacts).
# which() drops rows whose length is NA; a bare logical index would
# return an all-NA row for each of them and inflate the count.
segment_length <- cn_data$endpos - cn_data$startpos
short_segments <- cn_data[which(segment_length < 1000000), ]
if (nrow(short_segments) > 0) {
  cat("Warning: Found", nrow(short_segments), "segments < 1Mb\n")
}

# Check purity estimate
rho_psi <- read.delim("sample_tumor_rho_and_psi.txt")
# Prefer the row labelled FRAC_GENOME when row names are present;
# otherwise fall back to the second row.
purity <- if ("FRAC_GENOME" %in% rownames(rho_psi)) {
  rho_psi["FRAC_GENOME", "rho"]
} else {
  rho_psi$rho[2]
}
# A missing or NA purity would make the comparison below fail, so it is
# reported explicitly instead.
if (length(purity) != 1 || is.na(purity)) {
  cat("Warning: Could not read a purity estimate\n")
} else if (purity < 0.3) {
  cat("Warning: Low estimated purity:", purity, "\n")
}