Skip to content

Genomic Data Smoothing

LOWESS for methylation profiles, ChIP-seq signals, and other genomic data.

Overview

Genomic data often contains noise from sequencing depth variation, PCR artifacts, or biological heterogeneity. LOWESS smoothing helps reveal underlying patterns.


Methylation Profile Smoothing

The Challenge

DNA methylation data (from bisulfite sequencing or arrays) shows position-dependent patterns that can be obscured by measurement noise.

Solution

library(rfastlowess)

# Simulate methylation data
set.seed(42)
n <- 1000
positions <- sort(runif(n, 0, 1e6))

# True pattern
true_meth <- 0.5 + 0.3 * sin(positions / 1e5)

# Observed with noise
observed <- true_meth + rnorm(n, sd = 0.15)
observed <- pmax(0, pmin(1, observed))

# Smooth
result <- Lowess(
    fraction = 0.1,
    iterations = 3,
    confidence_intervals = 0.95
)$fit(positions, observed)

# Plot
plot(positions, observed, pch = ".", col = "gray",
     xlab = "Genomic Position (bp)", ylab = "Methylation Level",
     main = "Methylation Profile Smoothing")
lines(result$x, result$y, col = "blue", lwd = 2)
lines(result$x, result$confidence_lower, col = "blue", lty = 2)
lines(result$x, result$confidence_upper, col = "blue", lty = 2)
import fastlowess as fl
import numpy as np
import matplotlib.pyplot as plt

# Simulate methylation data along a chromosome
np.random.seed(42)
n_positions = 1000
positions = np.sort(np.random.uniform(0, 1e6, n_positions))

# True methylation pattern (varies along chromosome)
true_methylation = 0.5 + 0.3 * np.sin(positions / 1e5)

# Observed with noise
observed = true_methylation + np.random.normal(0, 0.15, n_positions)
observed = np.clip(observed, 0, 1)  # Methylation is 0-1

# Smooth with LOWESS
result = fl.smooth(
    positions, observed,
    fraction=0.1,           # Small fraction for local detail
    iterations=3,           # Robustness for outliers
    confidence_intervals=0.95
)

# Plot
plt.figure(figsize=(12, 5))
plt.scatter(positions, observed, s=2, alpha=0.3, label="Observed")
plt.plot(positions, result["y"], "b-", lwd=2, label="LOWESS smoothed")
plt.fill_between(
    positions,
    result["confidence_lower"],
    result["confidence_upper"],
    alpha=0.2, label="95% CI"
)
plt.xlabel("Genomic Position (bp)")
plt.ylabel("Methylation Level")
plt.legend()
plt.title("Methylation Profile Smoothing")
plt.show()
use fastLowess::prelude::*;
use ndarray::Array1;

let positions: Array1<f64> = /* sorted genomic positions */;
let observed: Array1<f64> = /* methylation levels 0-1 */;

let model = Lowess::new()
    .fraction(0.1)
    .iterations(3)
    .confidence_intervals(0.95)
    .adapter(Batch)
    .build()?;

let result = model.fit(&positions, &observed)?;
// result.y contains smoothed methylation profile
// result.confidence_lower/upper contain 95% CI bounds
using FastLOWESS

# positions and observed are your methylation data
result = smooth(
    positions, observed,
    fraction=0.1,
    iterations=3,
    confidence_intervals=0.95
)

# Smoothed profile in result.y
# CI bounds in result.confidence_lower/upper
const fl = require('fastlowess');

// positions and observed are your methylation data (Float64Array)
const result = fl.smooth(positions, observed, {
    fraction: 0.1,
    iterations: 3,
    confidenceIntervals: 0.95
});

// Smoothed profile in result.y
// CI bounds in result.confidenceLower/Upper
import { smooth } from 'fastlowess-wasm';

// positions and observed are your methylation data (Float64Array)
const result = smooth(positions, observed, {
    fraction: 0.1,
    iterations: 3,
    confidenceIntervals: 0.95
});

// Smoothed profile in result.y
// CI bounds in result.confidenceLower/Upper
#include "fastlowess.hpp"

// positions and observed are std::vector<double>
auto result = fastlowess::smooth(positions, observed, {
    .fraction = 0.1,
    .iterations = 3,
    .confidence_intervals = 0.95
});

// Smoothed profile in result.yVector()
// CI bounds in result.confidenceLower()/result.confidenceUpper()

ChIP-seq Signal Smoothing

Application

ChIP-seq experiments produce sparse, noisy coverage data. LOWESS can help identify binding regions.

set.seed(123)
positions <- seq(0, 10000, by = 10)
n <- length(positions)

# Simulate peaks
background <- 10
peak1 <- 50 * exp(-((positions - 2000)^2) / (2 * 200^2))
peak2 <- 80 * exp(-((positions - 5000)^2) / (2 * 300^2))
peak3 <- 40 * exp(-((positions - 8000)^2) / (2 * 150^2))

true_signal <- background + peak1 + peak2 + peak3
observed <- rpois(n, true_signal)

result <- Lowess(
    fraction = 0.05,
    iterations = 5
)$fit(positions, observed)

# Find peaks
threshold <- quantile(result$y, 0.75)
peak_positions <- positions[result$y > threshold]
# Simulate ChIP-seq coverage with peaks
np.random.seed(123)
positions = np.arange(0, 10000, 10)
n = len(positions)

# Background + peaks
background = 10
peak1 = 50 * np.exp(-((positions - 2000) ** 2) / (2 * 200 ** 2))
peak2 = 80 * np.exp(-((positions - 5000) ** 2) / (2 * 300 ** 2))
peak3 = 40 * np.exp(-((positions - 8000) ** 2) / (2 * 150 ** 2))

true_signal = background + peak1 + peak2 + peak3
observed = np.random.poisson(true_signal)  # Poisson noise

# Smooth with robustness for sporadic high counts
result = fl.smooth(
    positions, observed.astype(float),
    fraction=0.05,   # Very local smoothing
    iterations=5,    # Strong robustness
    return_residuals=True
)

# Identify peaks (smoothed signal significantly above background)
threshold = np.percentile(result["y"], 75)
peaks = positions[result["y"] > threshold]
print(f"Peak regions: {peaks}")
let positions: Array1<f64> = Array1::range(0.0, 10000.0, 10.0);
let observed: Array1<f64> = /*ChIP-seq counts*/;

let model = Lowess::new()
    .fraction(0.05)
    .iterations(5)
    .return_residuals()
    .adapter(Batch)
    .build()?;

let result = model.fit(&positions, &observed)?;

// Find peaks above threshold
let threshold = /* compute 75th percentile */;
let peak_positions: Vec<f64> = positions
    .iter()
    .zip(result.y.iter())
    .filter(|(_, &y)| y > threshold)
    .map(|(&p, _)| p)
    .collect();
using FastLOWESS

# positions and observed are your ChIP-seq data
result = smooth(
    positions, observed,
    fraction=0.05,
    iterations=5
)

# Find peaks above 75th percentile
threshold = quantile(result.y, 0.75)
peak_indices = findall(y -> y > threshold, result.y)
peak_positions = positions[peak_indices]
const fl = require('fastlowess');

const result = fl.smooth(positions, observed, {
    fraction: 0.05,
    iterations: 5
});

// Identify peaks above threshold
const smoothed = result.y;
const threshold = 50.0; // Example threshold
const peaks = positions.filter((p, i) => smoothed[i] > threshold);
import { smooth } from 'fastlowess-wasm';

const result = smooth(positions, observed, {
    fraction: 0.05,
    iterations: 5
});

// Find peaks
const smoothed = result.y;
const peaks = positions.filter((p, i) => smoothed[i] > 25.0);
#include "fastlowess.hpp"

auto result = fastlowess::smooth(positions, observed, {
    .fraction = 0.05,
    .iterations = 5
});

// Find peaks above threshold
std::vector<double> peaks;
for (size_t i = 0; i < result.size(); ++i) {
    if (result.y(i) > 25.0) {
        peaks.push_back(result.x(i));
    }
}

Large Genome Coverage (Streaming)

For whole-genome data that doesn't fit in memory:

model <- StreamingLowess(
    fraction = 0.05,
    chunk_size = 100000,
    overlap = 10000,
    merge_strategy = "weighted"
)
result <- model$process_chunk(positions, coverage)
final <- model$finalize()
# Process chromosome-by-chromosome or in chunks
result = fl.smooth_streaming(
    positions, coverage,
    fraction=0.05,
    chunk_size=100000,    # 100kb chunks
    overlap=10000,        # 10kb overlap
    merge_strategy="weighted"
)
use fastLowess::prelude::*;

let model = Lowess::new()
    .fraction(0.05)
    .iterations(3)
    .adapter(Streaming {
        chunk_size: 100_000,
        overlap: 10_000,
        merge_strategy: Weighted,
    })
    .build()?;

// Process chunks from file or stream
let mut processor = model.processor();
for chunk in chromosome_chunks {
    processor.process_chunk(&chunk.positions, &chunk.coverage)?;
}
let result = processor.finalize()?;
using FastLOWESS

# coverage and positions are chromosome-scale vectors
result = smooth_streaming(
    positions, coverage,
    fraction=0.05,
    chunk_size=100000,
    overlap=10000,
    merge_strategy="weighted"
)
const { StreamingLowess } = require('fastlowess');

const processor = new StreamingLowess(
    { fraction: 0.05, iterations: 3 },
    { chunkSize: 100000, overlap: 10000 }
);

// Process genomic chunks from stream or file
for (const chunk of genomicData) {
    processor.processChunk(chunk.positions, chunk.coverage);
}
const result = processor.finalize();
import { StreamingLowessWasm } from 'fastlowess-wasm';

const processor = new StreamingLowessWasm(
    { fraction: 0.05, iterations: 3 },
    { chunkSize: 100000, overlap: 10000 }
);

// Process chunks
for (const chunk of stream) {
    processor.processChunk(chunk.positions, chunk.coverage);
}
const result = processor.finalize();
#include "fastlowess.hpp"

// coverage and positions are chromosome-scale vectors
auto result = fastlowess::smooth_streaming(
    positions, coverage,
    { .fraction = 0.05, .iterations = 3 },
    { .chunk_size = 100000, .overlap = 10000 }
);

Best Practices for Genomic Data

Consideration Recommendation
Fraction 0.05–0.15 (preserve local features)
Iterations 3–5 (handle sequencing outliers)
Large data Use streaming mode
Sparse regions Use boundary_policy="extend"
Multiple chromosomes Process separately or ensure sorted

See Also