How to Experiment with Observation Windows

Change the date range and density of observations, then compare how the Neptune search performs.
Author

Jonathan Whitmore

Published

April 3, 2026

Introduction

This guide shows how to modify the observation window used in the Neptune search and compare optimizer results across different subsets. You will truncate the date range, subsample the observations, and see why time span matters more than observation density.

Setup

Imports and baseline data
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from discoverneptune.data import find_bundled_data_dir
from scipy.optimize import differential_evolution

data_dir = find_bundled_data_dir()

observations = pd.read_csv(data_dir / "uranus_observations_1781_1846.csv")
outer_planets = pd.read_csv(data_dir / "outer_planet_state_vectors_1781_01_01.csv")

observation_times_jd = observations["datetime_jd"].values.astype(float)
observed_longitudes_rad = observations["longitude_rad"].values.astype(float)
epoch_jd = float(observation_times_jd[0])

DAYS_PER_YEAR = 365.25
NEPTUNE_TRUE_A = 30.07  # AU

obs_years = (observation_times_jd - epoch_jd) / DAYS_PER_YEAR + 1781.0

print(f"Loaded {len(observations)} observations spanning {obs_years[0]:.0f}{obs_years[-1]:.0f}")
Loaded 66 observations spanning 1781–1846

Core functions

These functions are repeated from the tutorial so this document is self-contained. Expand the block below to see the full implementation.

Note

Tutorial-scale configuration. The optimizer calls below use a reduced DE budget (maxiter=30, popsize=5) so the document renders quickly. The production solver in src/discoverneptune/solver.py uses maxiter=300, popsize=15 by default. The sweep results below are qualitatively correct but can shift by a few AU if you rerun with the full budget.

build_simulation(), simulate_uranus(), and run_search()
import rebound

PLANET_MASSES = {
    "jupiter": 1 / 1047.35,
    "saturn": 1 / 3501.6,
    "uranus": 1 / 22869.0,
}


def build_simulation(
    neptune_params: dict[str, float] | None = None,
) -> rebound.Simulation:
    """Create a REBOUND simulation of the outer solar system.

    Args:
        neptune_params: If provided, must contain keys "mass_solar", "a",
            "e", and "l_rad" (mean longitude in radians).

    Returns:
        A configured REBOUND Simulation ready to integrate.
    """
    sim = rebound.Simulation()
    sim.units = ("yr", "AU", "Msun")

    # Sun
    sim.add(m=1.0)

    # Jupiter, Saturn, Uranus from JPL state vectors
    for row in outer_planets.itertuples(index=False):
        sim.add(
            m=PLANET_MASSES[row.planet],
            x=float(row.x),
            y=float(row.y),
            z=float(row.z),
            vx=float(row.vx) * DAYS_PER_YEAR,
            vy=float(row.vy) * DAYS_PER_YEAR,
            vz=float(row.vz) * DAYS_PER_YEAR,
            hash=row.planet.title(),
        )

    # Optional Neptune candidate (mean longitude via l=, NOT M=)
    if neptune_params is not None:
        sim.add(
            m=neptune_params["mass_solar"],
            a=neptune_params["a"],
            e=neptune_params["e"],
            l=neptune_params["l_rad"],
            hash="Neptune",
        )

    sim.move_to_com()
    sim.integrator = "whfast"
    sim.dt = 0.05
    return sim


def simulate_uranus(
    times_jd: np.ndarray,
    neptune_params: dict[str, float] | None = None,
) -> np.ndarray:
    """Integrate and return Uranus heliocentric longitudes.

    Args:
        times_jd: Julian dates at which to extract longitudes.
        neptune_params: Optional Neptune candidate parameters.

    Returns:
        Array of longitudes (radians) at each observation epoch.
    """
    sim = build_simulation(neptune_params)
    longitudes = np.empty(len(times_jd))

    for i, jd in enumerate(times_jd):
        t_yr = (jd - epoch_jd) / DAYS_PER_YEAR
        sim.integrate(t_yr)
        sun = sim.particles[0]
        uranus = sim.particles["Uranus"]
        longitudes[i] = np.arctan2(
            uranus.y - sun.y, uranus.x - sun.x
        )

    return longitudes


def run_search(
    times_jd: np.ndarray,
    obs_lon_rad: np.ndarray,
) -> dict:
    """Run differential evolution to find Neptune.

    Args:
        times_jd: Julian dates of the observations to fit.
        obs_lon_rad: Observed longitudes (radians) at those dates.

    Returns:
        Dict with keys "params", "rms_arcsec", and "converged".
    """

    def objective(x: np.ndarray) -> float:
        params = {"mass_solar": x[0], "a": x[1], "e": x[2], "l_rad": x[3]}
        sim_lon = simulate_uranus(times_jd, neptune_params=params)
        res_rad = (obs_lon_rad - sim_lon + np.pi) % (2 * np.pi) - np.pi
        return float(np.sum(res_rad**2))

    bounds = [
        (1e-5, 1e-4),
        (25.0, 45.0),
        (0.0, 0.15),
        (0.0, 2 * np.pi),
    ]

    result = differential_evolution(
        objective,
        bounds=bounds,
        maxiter=30,
        popsize=5,
        seed=42,
        workers=1,
        updating="deferred",
        polish=True,
    )

    best = {
        "mass_solar": result.x[0],
        "a": result.x[1],
        "e": result.x[2],
        "l_rad": result.x[3],
    }

    sim_lon = simulate_uranus(times_jd, neptune_params=best)
    res_rad = (obs_lon_rad - sim_lon + np.pi) % (2 * np.pi) - np.pi
    rms = float(np.sqrt(np.mean(np.degrees(res_rad) ** 2)) * 3600.0)

    return {"params": best, "rms_arcsec": rms, "converged": result.success}


print("Core functions defined.")
Core functions defined.

Change the end year

How much historical data does the optimizer need? We truncate the observations at progressively earlier cutoff years and run the search each time.

Run optimizer for different end-year cutoffs
cutoff_years = [1800, 1810, 1820, 1830, 1840, 1846]
cutoff_results = []

for cutoff in cutoff_years:
    mask = obs_years <= cutoff
    n_obs = int(mask.sum())
    res = run_search(observation_times_jd[mask], observed_longitudes_rad[mask])
    cutoff_results.append({
        "cutoff_year": cutoff,
        "n_obs": n_obs,
        "a_AU": res["params"]["a"],
        "mass_solar": res["params"]["mass_solar"],
        "rms_arcsec": res["rms_arcsec"],
        "converged": res["converged"],
    })
    print(f"Cutoff {cutoff}: n={n_obs}, a={res['params']['a']:.2f} AU, RMS={res['rms_arcsec']:.1f}″")

cutoff_df = pd.DataFrame(cutoff_results)
cutoff_df
Cutoff 1800: n=20, a=33.45 AU, RMS=0.2″
Cutoff 1810: n=30, a=29.23 AU, RMS=0.4″
Cutoff 1820: n=40, a=42.32 AU, RMS=1.0″
Cutoff 1830: n=50, a=38.07 AU, RMS=2.7″
Cutoff 1840: n=60, a=29.13 AU, RMS=3.0″
Cutoff 1846: n=66, a=30.14 AU, RMS=2.9″
cutoff_year n_obs a_AU mass_solar rms_arcsec converged
0 1800 20 33.450885 0.000055 0.193252 False
1 1810 30 29.229425 0.000025 0.354425 False
2 1820 40 42.316930 0.000078 0.964817 False
3 1830 50 38.071348 0.000094 2.651909 False
4 1840 60 29.133569 0.000039 2.982673 False
5 1846 66 30.140404 0.000044 2.868461 False

Subsample observations

Now keep the full 65-year window but take every Nth observation, reducing density while preserving the time baseline.

Run optimizer for different subsample steps
steps = [1, 2, 3, 5, 10]
subsample_results = []

for step in steps:
    idx = np.arange(0, len(observation_times_jd), step)
    # Always include the last observation to preserve the full time span
    if idx[-1] != len(observation_times_jd) - 1:
        idx = np.append(idx, len(observation_times_jd) - 1)
    times_sub = observation_times_jd[idx]
    lon_sub = observed_longitudes_rad[idx]
    res = run_search(times_sub, lon_sub)
    subsample_results.append({
        "step": step,
        "n_obs": len(idx),
        "a_AU": res["params"]["a"],
        "mass_solar": res["params"]["mass_solar"],
        "rms_arcsec": res["rms_arcsec"],
        "converged": res["converged"],
    })
    print(f"Every {step}th: n={len(idx)}, a={res['params']['a']:.2f} AU, RMS={res['rms_arcsec']:.1f}″")

subsample_df = pd.DataFrame(subsample_results)
subsample_df
Every 1th: n=66, a=30.14 AU, RMS=2.9″
Every 2th: n=34, a=31.55 AU, RMS=2.6″
Every 3th: n=23, a=30.76 AU, RMS=2.7″
Every 5th: n=14, a=31.03 AU, RMS=2.7″
Every 10th: n=8, a=32.93 AU, RMS=2.3″
step n_obs a_AU mass_solar rms_arcsec converged
0 1 66 30.140404 0.000044 2.868461 False
1 2 34 31.553943 0.000055 2.575373 False
2 3 23 30.761460 0.000050 2.723266 False
3 5 14 31.033011 0.000052 2.671253 False
4 10 8 32.928287 0.000068 2.312139 False

Visualize

Two-panel comparison of recovered semi-major axis
def _build(theme="light"):
    apply_style(theme)
    pal = palette(theme)
    cycle = category_cycle(theme)
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=fig_size("double"))

    # Left panel: cutoff year
    ax1.plot(cutoff_df["cutoff_year"], cutoff_df["a_AU"], "o-", color=cycle[0])
    truth_line(ax1, NEPTUNE_TRUE_A, label=f"True a = {NEPTUNE_TRUE_A} AU", axis="y", theme=theme)
    ax1.set_xlabel("Cutoff year")
    ax1.set_ylabel("Recovered semi-major axis (AU)")
    ax1.set_title("Effect of observation span")

    # Right panel: number of observations (subsampled)
    ax2.plot(subsample_df["n_obs"], subsample_df["a_AU"], "s-", color=cycle[1])
    truth_line(ax2, NEPTUNE_TRUE_A, label=f"True a = {NEPTUNE_TRUE_A} AU", axis="y", theme=theme)
    ax2.set_xlabel("Number of observations")
    ax2.set_ylabel("Recovered semi-major axis (AU)")
    ax2.set_title("Effect of observation density")

    return fig

display(dual_render(_build, alt="Recovered Neptune semi-major axis vs observation span and density, with true value reference line"))
Figure 1: Recovered semi-major axis vs cutoff year (left) and number of observations (right). The dashed line marks Neptune’s true a = 30.07 AU.

Why time span matters more than density

With fewer than about 40 years of data the optimizer has too little leverage on Neptune’s slow gravitational pull and tends to overfit, pushing the recovered semi-major axis away from the true value. Subsampling, by contrast, removes observations but preserves the full time baseline, and the results remain stable even with only 7-8 data points spread across 65 years. The lesson is that it is the duration of the perturbation signal, not the number of measurements, that constrains the inverse problem. This is a direct consequence of the mass-distance degeneracy: without a long baseline the optimizer cannot distinguish a nearby light perturber from a distant heavy one.