Genomic Data Smoothing¶

LOWESS for methylation profiles, ChIP-seq signals, and other genomic data.

Overview¶

Genomic data often contains noise from sequencing depth variation, PCR artifacts, or biological heterogeneity. LOWESS smoothing helps reveal underlying patterns.

Methylation Profile Smoothing¶

The Challenge¶

DNA methylation data (from bisulfite sequencing or arrays) shows position-dependent patterns that can be obscured by measurement noise.

Solution¶

R | Python | Rust | Julia | Node.js | WebAssembly | C++

library(rfastlowess)

# Simulate methylation data: n sorted positions drawn uniformly from [0, 1e6]
set.seed(42)
n <- 1000
positions <- sort(runif(n, 0, 1e6))

# True pattern
true_meth <- 0.5 + 0.3 * sin(positions / 1e5)

# Observed with noise, clamped to the valid methylation range [0, 1]
observed <- true_meth + rnorm(n, sd = 0.15)
observed <- pmax(0, pmin(1, observed))

# Smooth
model <- Lowess(
    fraction = 0.1,
    iterations = 3,
    confidence_intervals = 0.95
)
result <- model$fit(positions, observed)

# Plot: raw points in gray, smoothed curve as a solid line,
# lower/upper confidence bounds as dashed lines
plot(positions, observed, pch = ".", col = "gray",
     xlab = "Genomic Position (bp)", ylab = "Methylation Level",
     main = "Methylation Profile Smoothing")
lines(result$x, result$y, col = "blue", lwd = 2)
lines(result$x, result$confidence_lower, col = "blue", lty = 2)
lines(result$x, result$confidence_upper, col = "blue", lty = 2)

import fastlowess as fl
import numpy as np
import matplotlib.pyplot as plt

# Simulate methylation data along a chromosome
# (1000 sorted positions drawn uniformly from [0, 1e6])
np.random.seed(42)
n_positions = 1000
positions = np.sort(np.random.uniform(0, 1e6, n_positions))

# True methylation pattern (varies along chromosome)
true_methylation = 0.5 + 0.3 * np.sin(positions / 1e5)

# Observed with noise
observed = true_methylation + np.random.normal(0, 0.15, n_positions)
observed = np.clip(observed, 0, 1)  # Methylation is 0-1

# Smooth with LOWESS
model = fl.Lowess(
    fraction=0.1,           # Small fraction for local detail
    iterations=3,           # Robustness for outliers
    confidence_intervals=0.95
)
# result.y is plotted as the smoothed curve; result.confidence_lower/upper
# are used as the edges of the shaded band below
result = model.fit(positions, observed)

# Plot: raw observations as faint points, smoothed curve, shaded confidence band
plt.figure(figsize=(12, 5))
plt.scatter(positions, observed, s=2, alpha=0.3, label="Observed")
plt.plot(positions, result.y, "b-", linewidth=2, label="LOWESS smoothed")
plt.fill_between(
    positions,
    result.confidence_lower,
    result.confidence_upper,
    alpha=0.2, label="95% CI"
)
plt.xlabel("Genomic Position (bp)")
plt.ylabel("Methylation Level")
plt.legend()
plt.title("Methylation Profile Smoothing")
plt.show()

use fastLowess::prelude::*;
use std::f64::consts::TAU;

/// Smooths a synthetic one-period sine profile with LOWESS and requests
/// 95% confidence bounds.
///
/// # Errors
/// Propagates any `LowessError` returned by `build()` or `fit()`.
fn main() -> Result<(), LowessError> {
    let n = 100usize;
    // Evenly spaced positions covering one full period, from 0 to TAU inclusive.
    let positions: Vec<f64> = (0..n).map(|i| i as f64 * TAU / (n - 1) as f64).collect();
    // Placeholder signal: a sine wave with a constant +0.1 offset.
    let observed: Vec<f64> = positions.iter().map(|&p| p.sin() + 0.1).collect();

    let model = Lowess::new()
        .fraction(0.1)
        .iterations(3)
        .confidence_intervals(0.95)
        .build()?;

    let result = model.fit(&positions, &observed)?;
    // result.y contains smoothed methylation profile
    // result.confidence_lower/upper contain 95% CI bounds

    Ok(())
}

using FastLOWESS
using Random

# Example input: positions 0, 10, ..., 10000 and a sinusoid plus seeded
# Gaussian noise (standard normal draws scaled by 5)
rng = MersenneTwister(42)
positions = collect(0.0:10.0:10000.0)
observed = 50.0 .+ sin.(positions ./ 100.0) .* 20.0 .+ randn(rng, length(positions)) .* 5.0

using FastLOWESS

# positions and observed are your methylation data
model = Lowess(;
    fraction=0.1,
    iterations=3,
    confidence_intervals=0.95
)
result = fit(model, positions, observed)

# Smoothed profile in result.y
# CI bounds in result.confidence_lower/upper

const fl = require('fastlowess');

// Example input: 1000 positions (0, 10, ..., 9990) and a sinusoid with
// uniform random noise in [0, 5) added on top
const positions = Float64Array.from({ length: 1000 }, (_, i) => i * 10.0);
const observed = Float64Array.from(positions, p => 50 + Math.sin(p/100)*20 + Math.random()*5);

// positions and observed are your methylation data (Float64Array)
const model = new fl.Lowess({
    fraction: 0.1,
    iterations: 3,
    confidence_intervals: 0.95
});
const result = model.fit(positions, observed);

// Smoothed profile in result.y
// CI bounds in result.confidence_lower/upper

const { Lowess } = require('fastlowess-wasm');

// Example input: 100 positions spaced 100 apart
const n = 100;
const positions = Float64Array.from({ length: n }, (_, i) => i * 100.0);
// Sinusoid plus a deterministic jitter in [-2.5, 2.5) (no random number generator involved)
const observed = Float64Array.from(positions, p => 50 + Math.sin(p / 100) * 20 + ((p * 7 % 17) / 17 - 0.5) * 5);

// positions and observed are your methylation data (Float64Array)
const model = new Lowess({
    fraction: 0.1,
    iterations: 3,
    confidence_intervals: 0.95
});
const result = model.fit(positions, observed);

// Smoothed profile in result.y
// CI bounds in result.confidence_lower/upper

#include <fastlowess.hpp>
#include <cmath>
#include <iostream>
#include <vector>

int main() {
    // Synthetic input: 100 positions 1000 apart with a sinusoidal signal
    // oscillating around 55.
    constexpr int kNumPoints = 100;
    std::vector<double> positions;
    std::vector<double> observed;
    positions.reserve(kNumPoints);
    observed.reserve(kNumPoints);
    for (int idx = 0; idx < kNumPoints; ++idx) {
        const double pos = idx * 1000.0;
        positions.push_back(pos);
        observed.push_back(50.0 + std::sin(pos / 1000.0) * 20.0 + 5.0);
    }

    // positions and observed are std::vector<double>
    fastlowess::Lowess model({ .fraction = 0.1, .iterations = 3, .confidence_intervals = 0.95 });
    auto result = model.fit(positions, observed).value();

    // Smoothed profile in result.y_vector()
    // CI bounds in result.confidence_lower()/result.confidence_upper()

    return 0;
}

ChIP-seq Signal Smoothing¶

Application¶

ChIP-seq experiments produce sparse, noisy coverage data. LOWESS can help identify binding regions.

R | Python | Rust | Julia | Node.js | WebAssembly | C++

library(rfastlowess)

# Simulated coverage on a regular grid: 0, 10, ..., 10000
set.seed(123)
positions <- seq(0, 10000, by = 10)
n <- length(positions)

# Simulate peaks: flat background plus three Gaussian-shaped bumps
# centred at 2000, 5000 and 8000
background <- 10
peak1 <- 50 * exp(-((positions - 2000)^2) / (2 * 200^2))
peak2 <- 80 * exp(-((positions - 5000)^2) / (2 * 300^2))
peak3 <- 40 * exp(-((positions - 8000)^2) / (2 * 150^2))

true_signal <- background + peak1 + peak2 + peak3
# Poisson-distributed counts around the true signal
observed <- rpois(n, true_signal)

model <- Lowess(
    fraction = 0.05,
    iterations = 5
)
result <- model$fit(positions, observed)

# Find peaks: positions whose smoothed value exceeds the 75th percentile
threshold <- quantile(result$y, 0.75)
peak_positions <- positions[result$y > threshold]

import fastlowess as fl
import numpy as np

# Simulate ChIP-seq coverage with peaks on a regular grid: 0, 10, ..., 9990
np.random.seed(123)
positions = np.arange(0, 10000, 10, dtype=float)

# Background + peaks (three Gaussian-shaped bumps at 2000, 5000 and 8000)
background = 10
peak1 = 50 * np.exp(-((positions - 2000) ** 2) / (2 * 200 ** 2))
peak2 = 80 * np.exp(-((positions - 5000) ** 2) / (2 * 300 ** 2))
peak3 = 40 * np.exp(-((positions - 8000) ** 2) / (2 * 150 ** 2))

true_signal = background + peak1 + peak2 + peak3
observed = np.random.poisson(true_signal)  # Poisson noise

# Smooth with robustness for sporadic high counts
model = fl.Lowess(
    fraction=0.05,   # Very local smoothing
    iterations=5,    # Strong robustness
    return_residuals=True
)
# The Poisson draws are integers; cast to float before fitting
result = model.fit(positions, observed.astype(float))

# Identify peaks (smoothed signal above its own 75th percentile)
threshold = np.percentile(result.y, 75)
peaks = positions[result.y > threshold]
print(f"Peak regions: {peaks}")

use fastLowess::prelude::*;
use std::f64::consts::TAU;

/// Smooths a synthetic coverage track and collects the positions whose
/// smoothed value exceeds 75% of the maximum smoothed value.
///
/// # Errors
/// Propagates any `LowessError` returned by `build()` or `fit()`.
fn main() -> Result<(), LowessError> {
    // NOTE(review): `x` and `y` are never read again in this function.
    let n = 100usize;
    let x: Vec<f64> = (0..n).map(|i| i as f64 * TAU / (n - 1) as f64).collect();
    let y: Vec<f64> = x.iter().map(|&xi| xi.sin() + 0.1).collect();

    // 0 to 9990 step 10
    let positions: Vec<f64> = (0..1000).map(|i| i as f64 * 10.0).collect();
    let observed: Vec<f64> = positions
        .iter()
        .map(|&p| (p / 1000.0).sin().abs() * 100.0 + 10.0)
        .collect();

    let model = Lowess::new()
        .fraction(0.05)
        .iterations(5)
        .return_residuals()
        .build()?;

    let result = model.fit(&positions, &observed)?;

    // Threshold: 75% of the largest smoothed value.
    let tallest = result
        .y
        .iter()
        .fold(f64::NEG_INFINITY, |best, &v| best.max(v));
    let threshold = tallest * 0.75;

    // Keep every position whose smoothed value is strictly above the threshold.
    let mut peak_positions: Vec<f64> = Vec::new();
    for (&pos, &level) in positions.iter().zip(result.y.iter()) {
        if level > threshold {
            peak_positions.push(pos);
        }
    }

    Ok(())
}

using FastLOWESS
# Statistics provides `quantile`, used for the peak threshold below
using Random, Statistics

# Example input: positions 0, 10, ..., 10000 and a sinusoid plus seeded Gaussian noise
rng = MersenneTwister(42)
positions = collect(0.0:10.0:10000.0)
observed = 50.0 .+ sin.(positions ./ 100.0) .* 20.0 .+ randn(rng, length(positions)) .* 5.0

using FastLOWESS

# positions and observed are your ChIP-seq data
model = Lowess(; fraction=0.05, iterations=5)
result = fit(model, positions, observed)

# Find peaks above 75th percentile
threshold = quantile(result.y, 0.75)
# Indices of smoothed values strictly above the threshold, mapped back to positions
peak_indices = findall(y -> y > threshold, result.y)
peak_positions = positions[peak_indices]

const fl = require('fastlowess');

// Example input: 1000 positions (0, 10, ..., 9990) and a sinusoid with
// uniform random noise in [0, 5) added on top
const positions = Float64Array.from({ length: 1000 }, (_, i) => i * 10.0);
const observed = Float64Array.from(positions, p => 50 + Math.sin(p/100)*20 + Math.random()*5);

const model = new fl.Lowess({
    fraction: 0.05,
    iterations: 5
});
const result = model.fit(positions, observed);

// Identify peaks above threshold
const smoothed = result.y;
const threshold = 50.0; // Example threshold
// Keep the positions whose smoothed value is strictly above the threshold
const peaks = positions.filter((p, i) => smoothed[i] > threshold);

const { Lowess } = require('fastlowess-wasm');

// Example input: 100 positions spaced 100 apart; the signal is a sinusoid
// around 50 (amplitude 20) plus a deterministic jitter in [-2.5, 2.5)
const n = 100;
const positions = Float64Array.from({ length: n }, (_, i) => i * 100.0);
const observed = Float64Array.from(positions, p => 50 + Math.sin(p / 100) * 20 + ((p * 7 % 17) / 17 - 0.5) * 5);

const model = new Lowess({
    fraction: 0.05,
    iterations: 5
});
const result = model.fit(positions, observed);

// Find peaks
const smoothed = result.y;
// Example threshold at the midline of the sample signal. The input never drops
// below 27.5, so a cutoff under that floor would presumably select every position.
const threshold = 50.0;
const peaks = positions.filter((p, i) => smoothed[i] > threshold);

#include <fastlowess.hpp>
#include <cmath>
#include <iostream>
#include <vector>

int main() {
    // Synthetic input: 100 positions 1000 apart; the signal stays within [35, 75].
    const int n = 100;
    std::vector<double> positions(n), observed(n);
    for (int i = 0; i < n; ++i) {
        positions[i] = i * 1000.0;
        observed[i] = 50.0 + std::sin(positions[i] / 1000.0) * 20.0 + 5.0;
    }

    fastlowess::Lowess model({ .fraction = 0.05, .iterations = 5 });
    auto result = model.fit(positions, observed).value();

    // Find peaks above threshold.
    // Example threshold at the midline of the sample signal; a cutoff below the
    // signal's floor (35) would presumably select every position.
    const double threshold = 55.0;
    std::vector<double> peaks;
    const auto& y_vals = result.y_vector();
    const auto& x_vals = result.x_vector();
    for (size_t i = 0; i < y_vals.size(); ++i) {
        if (y_vals[i] > threshold) {
            peaks.push_back(x_vals[i]);
        }
    }

    return 0;
}

Large Genome Coverage (Streaming)¶

For whole-genome data that doesn't fit in memory:

R | Python | Rust | Julia | Node.js | WebAssembly | C++

library(rfastlowess)
# Example input: positions 0, 10, ..., 10000 and a sinusoid plus seeded Gaussian noise
set.seed(42)
positions <- seq(0, 10000, by = 10)
observed <- 50 + sin(positions / 100) * 20 + rnorm(length(positions), sd = 5)
coverage <- observed  # alias: coverage = observed counts

model <- StreamingLowess(
    fraction = 0.05,
    chunk_size = 100000,
    overlap = 10000,
    merge_strategy = "weighted_average"
)
# Feed the data as a single chunk, then finalize.
# NOTE(review): presumably finalize() returns whatever output is still
# buffered after the last chunk - confirm against the rfastlowess docs.
result <- model$process_chunk(positions, coverage)
final <- model$finalize()

import fastlowess as fl
import numpy as np

# Example input: positions 0, 10, ..., 9990 and Poisson(50) counts cast to float
np.random.seed(42)
positions = np.arange(0, 10000, 10, dtype=float)
coverage = np.random.poisson(50, len(positions)).astype(float)

# Process chromosome-by-chromosome or in chunks
model = fl.StreamingLowess(
    fraction=0.05,
    chunk_size=100000,    # 100kb chunks
    overlap=10000,        # 10kb overlap
    merge_strategy="weighted_average"
)
# Feed the data as a single chunk, then finalize; only finalize()'s return
# value is kept here
model.process_chunk(positions, coverage)
result = model.finalize()

use fastLowess::prelude::*;

/// Runs the streaming smoother over one synthetic chunk and finalizes it.
///
/// # Errors
/// Propagates any `LowessError` returned by `build()`, `process_chunk()`
/// or `finalize()`.
fn main() -> Result<(), LowessError> {
    // 1001 positions: 0, 10, ..., 10000.
    let positions: Vec<f64> = (0..=1000).map(|i| i as f64 * 10.0).collect();
    // Sinusoidal coverage with a constant +5 offset.
    let coverage: Vec<f64> = positions
        .iter()
        .map(|&pos| 50.0 + (pos / 100.0).sin() * 20.0 + 5.0)
        .collect();

    let mut processor = StreamingLowess::new()
        .fraction(0.05)
        .iterations(3)
        .chunk_size(50)
        .overlap(10)
        .merge_strategy("weighted_average")
        .build()?;

    // Feed the whole series as one chunk, then finalize.
    processor.process_chunk(&positions, &coverage)?;
    let result = processor.finalize()?;

    Ok(())
}

using FastLOWESS
using Random

# Example input: positions 0, 10, ..., 10000 and a sinusoid plus seeded Gaussian noise
rng = MersenneTwister(42)
positions = collect(0.0:10.0:10000.0)
observed = 50.0 .+ sin.(positions ./ 100.0) .* 20.0 .+ randn(rng, length(positions)) .* 5.0
# coverage is another name for the same vector as observed (no copy is made)
coverage = observed

using FastLOWESS

# coverage and positions are chromosome-scale vectors
model = StreamingLowess(;
    fraction=0.05,
    chunk_size=100000,
    overlap=10000,
    merge_strategy="weighted_average"
)
# Feed the data as a single chunk, then finalize
process_chunk(model, positions, coverage)
result = finalize(model)

const { StreamingLowess } = require('fastlowess');

// Example input: 1000 positions (0, 10, ..., 9990) and a sinusoid with
// uniform random noise in [0, 5) added on top
const positions = Float64Array.from({ length: 1000 }, (_, i) => i * 10.0);
const observed = Float64Array.from(positions, p => 50 + Math.sin(p/100)*20 + Math.random()*5);
// Array of genomic chunks to process
// (here: the example input split into two halves of 500 points each)
const genomicData = [
    { positions: positions.slice(0, 500), coverage: observed.slice(0, 500) },
    { positions: positions.slice(500), coverage: observed.slice(500) }
];

// Two option objects: the first carries fraction/iterations,
// the second carries chunk_size/overlap
const processor = new StreamingLowess(
    { fraction: 0.05, iterations: 3 },
    { chunk_size: 100000, overlap: 10000 }
);

// Process genomic chunks from stream or file
for (const chunk of genomicData) {
    processor.process_chunk(chunk.positions, chunk.coverage);
}
// Called once, after the last chunk has been fed
const result = processor.finalize();

const { StreamingLowess } = require('fastlowess-wasm');

// Example input: 1001 positions (0, 10, ..., 10000) with a noise-free
// sinusoidal signal
const xChunk = Float64Array.from({ length: 1001 }, (_, i) => i * 10.0);
const yChunk = Float64Array.from(xChunk, p => 50 + Math.sin(p / 100) * 20 + 5.0);

// Two option objects: the first carries fraction/iterations,
// the second carries chunk_size/overlap
const processor = new StreamingLowess(
    { fraction: 0.05, iterations: 3 },
    { chunk_size: 100, overlap: 10 }
);

// Feed the data as a single chunk, then finalize
processor.process_chunk(xChunk, yChunk);
const result = processor.finalize();

#include <fastlowess.hpp>
#include <cmath>
#include <iostream>
#include <vector>

int main() {
    const int n = 100;
    std::vector<double> positions(n), coverage(n);
    for (int i = 0; i < n; ++i) {
        positions[i] = i * 1000.0;
        coverage[i] = 50.0 + std::sin(positions[i] / 1000.0) * 20.0 + 5.0;
    }

    // coverage and positions are chromosome-scale vectors
    fastlowess::StreamingOptions s_opts;
    s_opts.fraction = 0.05;
    s_opts.iterations = 3;
    s_opts.chunk_size = 100000;
    s_opts.overlap = 10000;
    fastlowess::StreamingLowess stream(s_opts);
    (void)stream.process_chunk(positions, coverage);
    auto result = stream.finalize().value();

    return 0;
}

Best Practices for Genomic Data¶

| Consideration | Recommendation |
|---|---|
| Fraction | 0.05–0.15 (preserve local features) |
| Iterations | 3–5 (handle sequencing outliers) |
| Large data | Use streaming mode |
| Sparse regions | Use `boundary_policy="extend"` |
| Multiple chromosomes | Process each chromosome separately, or ensure positions are sorted |

Genomic Data Smoothing¶

Overview¶

Methylation Profile Smoothing¶

The Challenge¶

Solution¶

ChIP-seq Signal Smoothing¶

Application¶

Large Genome Coverage (Streaming)¶

Best Practices for Genomic Data¶

See Also¶