How to Experiment with Observation Windows

Change the date range and density of observations, then compare how the Neptune search performs.

Author

Jonathan Whitmore

Published

April 3, 2026

Introduction

This guide shows how to modify the observation window used in the Neptune search and compare optimizer results across different subsets. You will truncate the date range, subsample the observations, and see why time span matters more than observation density.

Setup

Imports and baseline data

from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from discoverneptune.data import find_bundled_data_dir
from scipy.optimize import differential_evolution

data_dir = find_bundled_data_dir()

observations = pd.read_csv(data_dir / "uranus_observations_1781_1846.csv")
outer_planets = pd.read_csv(data_dir / "outer_planet_state_vectors_1781_01_01.csv")

observation_times_jd = observations["datetime_jd"].values.astype(float)
observed_longitudes_rad = observations["longitude_rad"].values.astype(float)
epoch_jd = float(observation_times_jd[0])

DAYS_PER_YEAR = 365.25
NEPTUNE_TRUE_A = 30.07  # AU

obs_years = (observation_times_jd - epoch_jd) / DAYS_PER_YEAR + 1781.0

print(f"Loaded {len(observations)} observations spanning {obs_years[0]:.0f}–{obs_years[-1]:.0f}")

Loaded 66 observations spanning 1781–1846

Core functions

These functions are repeated from the tutorial so this document is self-contained. Expand the block below to see the full implementation.

Note

Tutorial-scale configuration. The optimizer calls below use a reduced DE budget (maxiter=30, popsize=5) so the document renders quickly. The production solver in src/discoverneptune/solver.py uses maxiter=300, popsize=15 by default. The sweep results below are qualitatively correct but can shift by a few AU if you rerun with the full budget.

build_simulation(), simulate_uranus(), and run_search()

import rebound

PLANET_MASSES = {
    "jupiter": 1 / 1047.35,
    "saturn": 1 / 3501.6,
    "uranus": 1 / 22869.0,
}


def build_simulation(
    neptune_params: dict[str, float] | None = None,
) -> rebound.Simulation:
    """Create a REBOUND simulation of the outer solar system.

    Args:
        neptune_params: If provided, must contain keys "mass_solar", "a",
            "e", and "l_rad" (mean longitude in radians).

    Returns:
        A configured REBOUND Simulation ready to integrate.
    """
    sim = rebound.Simulation()
    sim.units = ("yr", "AU", "Msun")

    # Sun
    sim.add(m=1.0)

    # Jupiter, Saturn, Uranus from JPL state vectors
    for row in outer_planets.itertuples(index=False):
        sim.add(
            m=PLANET_MASSES[row.planet],
            x=float(row.x),
            y=float(row.y),
            z=float(row.z),
            vx=float(row.vx) * DAYS_PER_YEAR,
            vy=float(row.vy) * DAYS_PER_YEAR,
            vz=float(row.vz) * DAYS_PER_YEAR,
            hash=row.planet.title(),
        )

    # Optional Neptune candidate (mean longitude via l=, NOT M=)
    if neptune_params is not None:
        sim.add(
            m=neptune_params["mass_solar"],
            a=neptune_params["a"],
            e=neptune_params["e"],
            l=neptune_params["l_rad"],
            hash="Neptune",
        )

    sim.move_to_com()
    sim.integrator = "whfast"
    sim.dt = 0.05
    return sim


def simulate_uranus(
    times_jd: np.ndarray,
    neptune_params: dict[str, float] | None = None,
) -> np.ndarray:
    """Integrate and return Uranus heliocentric longitudes.

    Args:
        times_jd: Julian dates at which to extract longitudes.
        neptune_params: Optional Neptune candidate parameters.

    Returns:
        Array of longitudes (radians) at each observation epoch.
    """
    sim = build_simulation(neptune_params)
    longitudes = np.empty(len(times_jd))

    for i, jd in enumerate(times_jd):
        t_yr = (jd - epoch_jd) / DAYS_PER_YEAR
        sim.integrate(t_yr)
        sun = sim.particles[0]
        uranus = sim.particles["Uranus"]
        longitudes[i] = np.arctan2(
            uranus.y - sun.y, uranus.x - sun.x
        )

    return longitudes


def run_search(
    times_jd: np.ndarray,
    obs_lon_rad: np.ndarray,
) -> dict:
    """Run differential evolution to find Neptune.

    Args:
        times_jd: Julian dates of the observations to fit.
        obs_lon_rad: Observed longitudes (radians) at those dates.

    Returns:
        Dict with keys "params", "rms_arcsec", and "converged".
    """

    def objective(x: np.ndarray) -> float:
        params = {"mass_solar": x[0], "a": x[1], "e": x[2], "l_rad": x[3]}
        sim_lon = simulate_uranus(times_jd, neptune_params=params)
        res_rad = (obs_lon_rad - sim_lon + np.pi) % (2 * np.pi) - np.pi
        return float(np.sum(res_rad**2))

    bounds = [
        (1e-5, 1e-4),
        (25.0, 45.0),
        (0.0, 0.15),
        (0.0, 2 * np.pi),
    ]

    result = differential_evolution(
        objective,
        bounds=bounds,
        maxiter=30,
        popsize=5,
        seed=42,
        workers=1,
        updating="deferred",
        polish=True,
    )

    best = {
        "mass_solar": result.x[0],
        "a": result.x[1],
        "e": result.x[2],
        "l_rad": result.x[3],
    }

    sim_lon = simulate_uranus(times_jd, neptune_params=best)
    res_rad = (obs_lon_rad - sim_lon + np.pi) % (2 * np.pi) - np.pi
    rms = float(np.sqrt(np.mean(np.degrees(res_rad) ** 2)) * 3600.0)

    return {"params": best, "rms_arcsec": rms, "converged": result.success}


print("Core functions defined.")

Core functions defined.

Change the end year

How much historical data does the optimizer need? We truncate the observations at progressively earlier cutoff years and run the search each time.

Run optimizer for different end-year cutoffs

cutoff_years = [1800, 1810, 1820, 1830, 1840, 1846]
cutoff_results = []

for cutoff in cutoff_years:
    mask = obs_years <= cutoff
    n_obs = int(mask.sum())
    res = run_search(observation_times_jd[mask], observed_longitudes_rad[mask])
    cutoff_results.append({
        "cutoff_year": cutoff,
        "n_obs": n_obs,
        "a_AU": res["params"]["a"],
        "mass_solar": res["params"]["mass_solar"],
        "rms_arcsec": res["rms_arcsec"],
        "converged": res["converged"],
    })
    print(f"Cutoff {cutoff}: n={n_obs}, a={res['params']['a']:.2f} AU, RMS={res['rms_arcsec']:.1f}″")

cutoff_df = pd.DataFrame(cutoff_results)
cutoff_df

Cutoff 1800: n=20, a=33.45 AU, RMS=0.2″
Cutoff 1810: n=30, a=29.23 AU, RMS=0.4″
Cutoff 1820: n=40, a=42.32 AU, RMS=1.0″
Cutoff 1830: n=50, a=38.07 AU, RMS=2.7″
Cutoff 1840: n=60, a=29.13 AU, RMS=3.0″
Cutoff 1846: n=66, a=30.14 AU, RMS=2.9″

	cutoff_year	n_obs	a_AU	mass_solar	rms_arcsec	converged
0	1800	20	33.450885	0.000055	0.193252	False
1	1810	30	29.229425	0.000025	0.354425	False
2	1820	40	42.316930	0.000078	0.964817	False
3	1830	50	38.071348	0.000094	2.651909	False
4	1840	60	29.133569	0.000039	2.982673	False
5	1846	66	30.140404	0.000044	2.868461	False

Subsample observations

Now keep the full 65-year window but take every Nth observation, reducing density while preserving the time baseline.

Run optimizer for different subsample steps

steps = [1, 2, 3, 5, 10]
subsample_results = []

for step in steps:
    idx = np.arange(0, len(observation_times_jd), step)
    # Always include the last observation to preserve the full time span
    if idx[-1] != len(observation_times_jd) - 1:
        idx = np.append(idx, len(observation_times_jd) - 1)
    times_sub = observation_times_jd[idx]
    lon_sub = observed_longitudes_rad[idx]
    res = run_search(times_sub, lon_sub)
    subsample_results.append({
        "step": step,
        "n_obs": len(idx),
        "a_AU": res["params"]["a"],
        "mass_solar": res["params"]["mass_solar"],
        "rms_arcsec": res["rms_arcsec"],
        "converged": res["converged"],
    })
    print(f"Every {step}th: n={len(idx)}, a={res['params']['a']:.2f} AU, RMS={res['rms_arcsec']:.1f}″")

subsample_df = pd.DataFrame(subsample_results)
subsample_df

Every 1th: n=66, a=30.14 AU, RMS=2.9″
Every 2th: n=34, a=31.55 AU, RMS=2.6″
Every 3th: n=23, a=30.76 AU, RMS=2.7″
Every 5th: n=14, a=31.03 AU, RMS=2.7″
Every 10th: n=8, a=32.93 AU, RMS=2.3″

	step	n_obs	a_AU	mass_solar	rms_arcsec	converged
0	1	66	30.140404	0.000044	2.868461	False
1	2	34	31.553943	0.000055	2.575373	False
2	3	23	30.761460	0.000050	2.723266	False
3	5	14	31.033011	0.000052	2.671253	False
4	10	8	32.928287	0.000068	2.312139	False

Visualize

Two-panel comparison of recovered semi-major axis

def _build(theme="light"):
    apply_style(theme)
    pal = palette(theme)
    cycle = category_cycle(theme)
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=fig_size("double"))

    # Left panel: cutoff year
    ax1.plot(cutoff_df["cutoff_year"], cutoff_df["a_AU"], "o-", color=cycle[0])
    truth_line(ax1, NEPTUNE_TRUE_A, label=f"True a = {NEPTUNE_TRUE_A} AU", axis="y", theme=theme)
    ax1.set_xlabel("Cutoff year")
    ax1.set_ylabel("Recovered semi-major axis (AU)")
    ax1.set_title("Effect of observation span")

    # Right panel: number of observations (subsampled)
    ax2.plot(subsample_df["n_obs"], subsample_df["a_AU"], "s-", color=cycle[1])
    truth_line(ax2, NEPTUNE_TRUE_A, label=f"True a = {NEPTUNE_TRUE_A} AU", axis="y", theme=theme)
    ax2.set_xlabel("Number of observations")
    ax2.set_ylabel("Recovered semi-major axis (AU)")
    ax2.set_title("Effect of observation density")

    return fig

display(dual_render(_build, alt="Recovered Neptune semi-major axis vs observation span and density, with true value reference line"))

Figure 1: Recovered semi-major axis vs cutoff year (left) and number of observations (right). The dashed line marks Neptune’s true a = 30.07 AU.

Why time span matters more than density

With fewer than about 40 years of data the optimizer has too little leverage on Neptune’s slow gravitational pull and tends to overfit, pushing the recovered semi-major axis away from the true value. Subsampling, by contrast, removes observations but preserves the full time baseline, and the results remain stable even with only 7-8 data points spread across 65 years. The lesson is that it is the duration of the perturbation signal, not the number of measurements, that constrains the inverse problem. This is a direct consequence of the mass-distance degeneracy: without a long baseline the optimizer cannot distinguish a nearby light perturber from a distant heavy one.