vignettes/data-interpretation.Rmd
data-interpretation.Rmd
library(Battenberg)
library(ggplot2)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, unionThe main output file [samplename]_copynumber.txt contains detailed copy number information:
# Read the copy number data
cn_data <- read.delim("sample_tumor_copynumber.txt")
# Examine the structure
head(cn_data)
colnames(cn_data)Basic Information: - chr: Chromosome - startpos, endpos: Segment boundaries - BAF: B-allele frequency - LogR: Log ratio of tumor vs normal coverage
Statistical Measures: - pval: P-value for subclonal vs clonal model - ntot: Internal copy number value (NOT total copy number)
Copy Number States (Solution A): - nMaj1_A, nMin1_A: Major/minor allele copy numbers for state 1 - frac1_A: Fraction of tumor cells with state 1 - nMaj2_A, nMin2_A: Major/minor alleles for state 2 (if subclonal) - frac2_A: Fraction of tumor cells with state 2 - SDfrac_A: Standard deviation of BAF (uncertainty measure)
# Identify clonal segments (single copy number state)
clonal_segments <- cn_data[cn_data$frac1_A == 1, ]
# Identify subclonal segments (two copy number states)
subclonal_segments <- cn_data[cn_data$frac1_A < 1 & !is.na(cn_data$frac2_A), ]
cat("Clonal segments:", nrow(clonal_segments), "\n")
cat("Subclonal segments:", nrow(subclonal_segments), "\n")
# For clonal segments
cn_data$total_cn_clonal <- cn_data$nMaj1_A + cn_data$nMin1_A
# For subclonal segments, calculate weighted average
cn_data$total_cn_subclonal <- ifelse(
!is.na(cn_data$nMaj2_A),
(cn_data$nMaj1_A + cn_data$nMin1_A) * cn_data$frac1_A +
(cn_data$nMaj2_A + cn_data$nMin2_A) * cn_data$frac2_A,
cn_data$nMaj1_A + cn_data$nMin1_A
)
# Primary copy number for analysis
cn_data$primary_total_cn <- ifelse(
is.na(cn_data$total_cn_subclonal),
cn_data$total_cn_clonal,
cn_data$total_cn_subclonal
)
# Classify events based on copy number
cn_data$cn_event <- case_when(
cn_data$primary_total_cn == 0 ~ "Homozygous deletion",
cn_data$primary_total_cn == 1 ~ "Heterozygous deletion",
cn_data$primary_total_cn == 2 ~ "Neutral",
cn_data$primary_total_cn == 3 ~ "Single copy gain",
cn_data$primary_total_cn == 4 ~ "Two copy gain",
cn_data$primary_total_cn >= 5 ~ "High-level amplification",
TRUE ~ "Other"
)
# Summary of events
table(cn_data$cn_event)
# LOH occurs when minor allele copy number is 0
cn_data$loh_state1 <- cn_data$nMin1_A == 0
cn_data$loh_state2 <- ifelse(is.na(cn_data$nMin2_A), FALSE, cn_data$nMin2_A == 0)
# Clonal LOH (all cells have LOH)
clonal_loh <- cn_data[cn_data$loh_state1 & cn_data$frac1_A == 1, ]
# Subclonal LOH (some cells have LOH)
subclonal_loh <- cn_data[
(cn_data$loh_state1 & cn_data$frac1_A < 1) |
(cn_data$loh_state2 & !is.na(cn_data$frac2_A)),
]
cat("Clonal LOH segments:", nrow(clonal_loh), "\n")
cat("Subclonal LOH segments:", nrow(subclonal_loh), "\n")
# Calculate total LOH burden
total_loh_length <- sum(clonal_loh$endpos - clonal_loh$startpos) +
sum(subclonal_loh$endpos - subclonal_loh$startpos)
# Genome size (approximate)
genome_size <- 3e9
loh_fraction <- total_loh_length / genome_size
cat("LOH burden:", round(loh_fraction * 100, 2), "% of genome\n")
# Count segments per chromosome
segments_per_chr <- table(cn_data$chr)
# Total number of segments
total_segments <- nrow(cn_data)
# Segments with copy number alterations
altered_segments <- sum(cn_data$primary_total_cn != 2)
cat("Total segments:", total_segments, "\n")
cat("Altered segments:", altered_segments, "\n")
cat("Fraction altered:", round(altered_segments/total_segments, 3), "\n")
# Chromothripsis: many segments in localized regions
# Look for chromosomes with >10 segments in <50Mb regions
identify_chromothripsis <- function(chr_data, min_segments = 10, max_size = 50e6) {
chr_data <- chr_data[order(chr_data$startpos), ]
for(i in 1:(nrow(chr_data) - min_segments + 1)) {
window_end <- i + min_segments - 1
region_size <- chr_data$endpos[window_end] - chr_data$startpos[i]
if(region_size <= max_size) {
return(TRUE)
}
}
return(FALSE)
}
# Check each chromosome
chromothripsis_chrs <- c()
for(chr in unique(cn_data$chr)) {
chr_data <- cn_data[cn_data$chr == chr, ]
if(identify_chromothripsis(chr_data)) {
chromothripsis_chrs <- c(chromothripsis_chrs, chr)
}
}
cat("Potential chromothripsis chromosomes:", paste(chromothripsis_chrs, collapse = ", "), "\n")
# Read purity/ploidy file
rho_psi <- read.delim("sample_tumor_rho_and_psi.txt")
# Extract values (use second row for FRAC_genome)
tumor_purity <- rho_psi$rho[2]
tumor_ploidy <- rho_psi$psi[2]
cat("Tumor purity (rho):", round(tumor_purity, 3), "\n")
cat("Tumor ploidy (psi):", round(tumor_ploidy, 3), "\n")
# Check consistency with copy number data
# Normal regions should have total CN close to purity-adjusted diploid
normal_regions <- cn_data[cn_data$primary_total_cn == 2, ]
expected_logr <- log2(tumor_purity * 2 + (1 - tumor_purity) * 2) - log2(2)
# Compare with observed LogR in normal regions
observed_logr <- median(normal_regions$LogR)
cat("Expected LogR for normal regions:", round(expected_logr, 3), "\n")
cat("Observed LogR for normal regions:", round(observed_logr, 3), "\n")
# Plot distribution of subclonal fractions
subclonal_fractions <- cn_data$frac2_A[!is.na(cn_data$frac2_A)]
if(length(subclonal_fractions) > 0) {
hist(subclonal_fractions, breaks = 20,
main = "Distribution of Subclonal Fractions",
xlab = "Fraction of cells with subclonal state",
ylab = "Number of segments")
}
# Create comprehensive summary
analysis_summary <- list(
sample_info = list(
tumor_purity = tumor_purity,
tumor_ploidy = tumor_ploidy,
total_segments = nrow(cn_data),
altered_segments = sum(cn_data$primary_total_cn != 2)
),
copy_number_events = list(
homozygous_deletions = sum(cn_data$primary_total_cn == 0),
heterozygous_deletions = sum(cn_data$primary_total_cn == 1),
neutral_regions = sum(cn_data$primary_total_cn == 2),
single_copy_gains = sum(cn_data$primary_total_cn == 3),
two_copy_gains = sum(cn_data$primary_total_cn == 4),
high_level_amplifications = sum(cn_data$primary_total_cn >= 5)
),
subclonal_info = list(
subclonal_segments = nrow(subclonal_segments),
significant_subclonal = nrow(significant_subclonal),
clonal_loh_segments = nrow(clonal_loh),
subclonal_loh_segments = nrow(subclonal_loh)
)
)
# Print summary
print(analysis_summary)
# Create detailed results table
results_table <- cn_data %>%
select(chr, startpos, endpos, primary_total_cn, cn_event,
nMaj1_A, nMin1_A, frac1_A, nMaj2_A, nMin2_A, frac2_A, pval) %>%
filter(primary_total_cn != 2) # Only altered segments
# Write results
write.table(results_table, "battenberg_analysis_results.txt",
sep = "\t", row.names = FALSE, quote = FALSE)
# Write summary
write.table(analysis_summary, "battenberg_analysis_summary.txt",
sep = "\t", row.names = TRUE, quote = FALSE)
# Create simple copy number profile
cn_data$midpoint <- (cn_data$startpos + cn_data$endpos) / 2
# Plot by chromosome
ggplot(cn_data, aes(x = midpoint, y = primary_total_cn)) +
geom_point(size = 0.5) +
geom_line() +
facet_wrap(~chr, scales = "free_x") +
labs(title = "Copy Number Profile",
x = "Genomic Position",
y = "Total Copy Number") +
theme_minimal()
# Plot subclonal fractions across genome
subclonal_data <- cn_data[!is.na(cn_data$frac2_A), ]
if(nrow(subclonal_data) > 0) {
ggplot(subclonal_data, aes(x = midpoint, y = frac2_A)) +
geom_point(aes(color = chr), size = 1) +
facet_wrap(~chr, scales = "free_x") +
labs(title = "Subclonal Fractions Across Genome",
x = "Genomic Position",
y = "Fraction of Cells with Subclonal State") +
theme_minimal() +
theme(legend.position = "none")
}
# Use utility functions for refit analysis
# Calculate refit values for specific segments
refit_result <- calc_rho_psi_refit(
refBAF = 0.65, # BAF of segment
refLogR = 0.3, # LogR of segment
refMajor = 3, # Major allele CN
refMinor = 1, # Minor allele CN
rho = tumor_purity, # Current purity
gamma_param = 1 # Platform gamma
)
print(refit_result)
# Generate refit suggestions from results
cnfit_to_refit_suggestions(
samplename = "sample_tumor",
subclones_file = "sample_tumor_copynumber.txt",
rho_psi_file = "sample_tumor_rho_and_psi.txt",
gamma_param = 1,
min_segment_size_mb = 2
)
# Check for potential issues
quality_checks <- list(
# Very short segments might be artifacts
short_segments = sum((cn_data$endpos - cn_data$startpos) < 1e6),
# Very low purity might indicate contamination
low_purity = tumor_purity < 0.3,
# Excessive segmentation might indicate noise
excessive_segments = nrow(cn_data) > 1000,
# Check for failed subclonal calls
failed_subclonal = sum(is.na(cn_data$frac1_A))
)
print(quality_checks)