diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5569289..8570829 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -29,4 +29,11 @@
 - Added elevation calculation
 - Removed watershed ID attribute from exported rasters (now includes elevation instead). You can still compute watershed IDs on the graph representation if needed.
 - Updated demo notebook
-- Minor bugfixes
\ No newline at end of file
+- Minor bugfixes
+
+# 1.6.20251210
+- more complete documentation
+- minor bugfixes and improvements
+- n_iterations now defaults to an empirical equation that depends on constant_phase and cooling_rate to achieve more reliable convergence.
+- fixed a bug in convergence detection.
+- improved convergence detection.
\ No newline at end of file
diff --git a/PyOCN/_version.py b/PyOCN/_version.py
index d8d84df..90a2620 100644
--- a/PyOCN/_version.py
+++ b/PyOCN/_version.py
@@ -1 +1 @@
-__version__ = "1.5.20251121"
\ No newline at end of file
+__version__ = "1.6.20251210"
\ No newline at end of file
diff --git a/PyOCN/ocn.py b/PyOCN/ocn.py
index 1009900..e309525 100644
--- a/PyOCN/ocn.py
+++ b/PyOCN/ocn.py
@@ -7,7 +7,6 @@
 
 import networkx as nx 
 import numpy as np
-import rasterio
 from tqdm import tqdm
 
 from ._statushandler import check_status
@@ -17,6 +16,7 @@
 
 if TYPE_CHECKING:
     import xarray as xr
+    import rasterio
 
 """
 High-level Optimized Channel Network (OCN) interface.
@@ -95,7 +95,7 @@ class OCN:
     gamma : float
         Exponent in the energy model.
     verbosity : int
-        Verbosity level for underlying library output (0-2).
+            Verbosity level (0-2). 0 is silent, 1 prints high-level progress messages, 2 prints detailed logging messages. Can be overridden in methods that support it.
     wrap : bool
         If true, enables periodic boundary conditions on the FlowGrid (read-only property).
     history : np.ndarray
@@ -149,7 +149,7 @@ def __init__(self, dag: nx.DiGraph, resolution: float=1.0, gamma: float = 0.5, r
         self.__p_c_graph = fgconv.from_digraph(  # does most of the work 
             dag, 
             resolution, 
-            verbose=(verbosity > 1), 
+            verbose=(verbosity == 2), 
             validate=validate, 
             wrap=wrap
         )
@@ -181,8 +181,8 @@ def from_net_type(cls, net_type:str, dims:tuple[int, int], resolution:float=1, g
             Exponent in the energy model.
         random_state : int | numpy.random.Generator | None, optional
             Seed or generator for RNG seeding.
-        verbosity : int, default 0
-            Verbosity level (0-2) for underlying library output.
+        verbosity : int
+            Verbosity level (0-2). 0 is silent, 1 prints high-level progress messages, 2 prints detailed logging messages. Can be overridden in methods that support it.
         wrap : bool, default False
             If true, allows wrapping around the edges of the grid (toroidal). If false, no wrapping is applied.
         vertical_exaggeration : float, default 1.0
@@ -202,10 +202,10 @@ def from_net_type(cls, net_type:str, dims:tuple[int, int], resolution:float=1, g
         ):
             raise ValueError(f"dims must be a tuple of two positive integers, got {dims}")
         
-        if verbosity == 1:
+        if verbosity == 2:
             print(f"Creating {net_type} network DiGraph with dimensions {dims}...", end="")
         dag = net_type_to_dag(net_type, dims)
-        if verbosity == 1:
+        if verbosity == 2:
             print(" Done.")
         
         # no need to validate inputs when using a predefined net_type. Saves time.
@@ -226,8 +226,8 @@ def from_digraph(cls, dag: nx.DiGraph, resolution:float=1, gamma=0.5, random_sta
             Exponent in the energy model.
         random_state : int | numpy.random.Generator | None, optional
             Seed or generator for RNG seeding.
-        verbosity : int, default 0
-            Verbosity level (0-2) for underlying library output.
+        verbosity : int
+            Verbosity level (0-2). 0 is silent, 1 prints high-level progress messages, 2 prints detailed logging messages. Can be overridden in methods that support it.
         wrap : bool, default False
             If true, allows wrapping around the edges of the grid (toroidal). If false, no wrapping is applied.
         vertical_exaggeration : float, default 1.0
@@ -386,6 +386,8 @@ def rng(self, random_state:int|None|np.random.Generator=None):
 
     @property
     def history(self) -> np.ndarray:
+        """The optimization history as a numpy array of shape (n_iterations, 3).
+        Each row corresponds to an iteration. The columns are iteration index, energy, and temperature."""
         return self.__history
 
     ###########################
@@ -799,7 +801,7 @@ def fit(
         n_iterations:int=None,
         pbar:bool=False,
         array_reports:int=0,
-        tol:float=None,
+        tol:float | None=None,
         max_iterations_per_loop=10_000,
         unwrap:bool=True,
         calculate_full_energy:bool=False) -> "xr.Dataset | None":
@@ -828,8 +830,9 @@ def fit(
             from the initial temperature. A value of 1.0 means the temperature is held
             constant for the entire optimization.
         n_iterations : int, optional
-            Total number of iterations. Defaults to ``40 * rows * cols``.
-            Always at least ``energy_reports * 10`` (this should only matter for
+            defaults to ``int((63 * exp(-0.448 * cooling_rate)) * rows * cols * (1 + constant_phase))``, which was empirically found to work well across a range of cooling rates.
+            When ``constant_phase = 1`` and ``constant_phase = 0``, this reduces to ``40 * rows * cols``.
+            Clamped to at least ``energy_reports * 10`` (this should only matter for
             extremely small grids, where ``rows * cols < 256``).
         pbar : bool, default True
             Whether to display a progress bar.
@@ -838,11 +841,10 @@ def fit(
             If 0 (default), returns None. If >0, returns an xarray.Dataset
             containing the state of the FlowGrid at approximately evenly spaced intervals
             throughout the optimization, including the initial and final states. Requires xarray to be installed. See notes on xarray output for details.
-        tol : float, optional
-            If provided, optimization will stop early if the relative change
-            in energy between reports is less than `tol`. Must be positive.
-            If None (default), no early stopping is performed.
-            Recommended values are in the range 1e-4 to 1e-6.
+        tol : float | None, optional
+            If provided, optimization will stop early if the average relative energy reduction per iteration is less than `tol` for two consecutive checks. Must be positive.
+            If None, no early stopping is performed.
+            Recommended values are in the range 1e-9 to 1e-6 per iteration. A good default is 3.2e-8.
         max_iterations_per_loop: int, optional
             If provided, the number of iterations steps to perform in each "chunk"
             of optimization. Energy and output arrays can be reported no more often
@@ -899,31 +901,31 @@ def fit(
         The proposal is accepted with the probability
         
         .. math::
-            P(\\text{accept}) = e^{-\Delta E / T},
+            P(\\text{accept}) = e^{-\\Delta E / T},
 
-        where :math:`\Delta E` is the change in energy the change would cause 
+        where :math:`\\Delta E` is the change in energy the change would cause 
         and :math:`T` is the temperature of the network.
 
         The total energy of the system is computed from the drained areas of each grid cell :math:`k` as
 
         .. math::
-            E = \sum_k A_k^\gamma
+            E = \\sum_k A_k^\\gamma
 
         The temperature of the network is governed by a cooling schedule, which is a function of iteration index.
         
-        Note that when :math:`\Delta E < 0`, the move is always accepted.
+        Note that when :math:`\\Delta E < 0`, the move is always accepted.
 
         The cooling schedule used by this method is a piecewise function of iteration index:
         
         .. math::
             T(i) = \\begin{cases}
-                E_0 & i < C N \\
-                E_0 \cdot e^{\;i - C N} & i \ge C N
-            \end{cases}
+                E_0 & i < C N \\\\
+                E_0 \\cdot e^{i - C N} & i \\ge C N
+            \\end{cases}
 
         where :math:`E_0` is the initial energy, :math:`N` is the total number
         of iterations, and :math:`C` is ``constant_phase``. Decreasing-energy
-        moves (:math:`\Delta E < 0`) are always accepted.
+        moves (:math:`\\Delta E < 0`) are always accepted.
 
         Alternative cooling schedules can be implemented using :meth:`fit_custom_cooling`.
         """
@@ -935,7 +937,7 @@ def fit(
         if cooling_rate is None:
             cooling_rate = 1.0
         if n_iterations is None:
-            n_iterations = 40 * self.dims[0] * self.dims[1]
+            n_iterations = int((63 * np.exp(-0.448 * cooling_rate)) * self.dims[0] * self.dims[1] * (1 + constant_phase))
 
         # create a cooling schedule from arguments
         cooling_func = simulated_annealing_schedule(
@@ -964,7 +966,7 @@ def fit_custom_cooling(
         iteration_start:int=0,
         pbar:bool=False,
         array_reports:int=0,
-        tol:float=None,
+        tol:float | None=None,
         max_iterations_per_loop=10_000,
         unwrap:bool=True,
         calculate_full_energy:bool=False,
@@ -1068,6 +1070,8 @@ def fit_custom_cooling(
             disable=not (pbar or self.verbosity >= 1)
         )
         
+        consecutive_convergence_required = 2
+        consecutive_convergence_tests_passed = 0
         while completed_iterations < n_iterations:
             iterations_this_loop = min(max_iterations_per_loop, n_iterations - completed_iterations)
             anneal_buf[:iterations_this_loop] = cooling_func(
@@ -1110,35 +1114,37 @@ def fit_custom_cooling(
                 END = '\033[0m'
                 T_over_E = anneal_buf[iterations_this_loop - 1]/e_old*100
                 ToE_str = f"{int(np.floor(T_over_E)):02d}.{int((T_over_E - np.floor(T_over_E))*100):02d}%"
-                de_over_E = (e_new - e_old)/e_old*100
-
+                de_over_E = (e_new - e_old)/e_old / iterations_this_loop
                 dEoE_sign = '+' if de_over_E >= 0 else '-'
-                dEoE_integer_part = int(np.floor(np.abs(de_over_E)))
-                dEoE_fractional_part = int((np.abs(de_over_E) - dEoE_integer_part)*100)
-                dEoE_str = f"{dEoE_sign}{dEoE_integer_part:02d}.{dEoE_fractional_part:02d}%"
+                dEoE_str = f"{dEoE_sign}{np.abs(de_over_E):.0e}"
+                
                 if T_over_E > 50: ToE_str = RED + ToE_str + END
                 elif T_over_E > 5: ToE_str = YELLOW + ToE_str + END
                 else: ToE_str = CYAN + ToE_str + END
-                if de_over_E > 1: dEoE_str = RED + dEoE_str + END
-                elif de_over_E > -1: dEoE_str = YELLOW + dEoE_str + END
+                if de_over_E > 1e-7: dEoE_str = RED + dEoE_str + END
+                elif de_over_E > -1e-7: dEoE_str = YELLOW + dEoE_str + END
                 else: dEoE_str = CYAN + dEoE_str + END
 
                 pbar.set_postfix({
                     "E": f"{self.energy:.1e}", 
                     "T/E": ToE_str,
-                    "ΔE/E": dEoE_str,
+                    # "ΔE/E/iter": dEoE_str,
+                    "ΔE/E/iter": dEoE_str,
                 })
                 pbar.update(iterations_this_loop)
 
             # check for convergence if requested
-            if (
-                (tol is not None) 
-                and (e_new <= e_old) 
-                and (abs((e_old - e_new)/e_old) if e_old > 0 else np.inf < tol)
-            ):
-                if self.verbosity > 1:
-                    print("Convergence reached, stopping optimization.")
-                break
+            can_check_convergence = (tol is not None) and (completed_iterations >= 0.333 * n_iterations)
+            improved_score = e_new <= e_old
+            relative_improvement_per_iteration = abs((e_old - e_new) / e_old)/iterations_this_loop if e_old > 0 else np.inf
+            if can_check_convergence and improved_score and (relative_improvement_per_iteration < tol):
+                consecutive_convergence_tests_passed += 1
+                if consecutive_convergence_tests_passed >= consecutive_convergence_required:
+                    if self.verbosity >= 1:
+                        print("Convergence reached, stopping optimization.")
+                    break
+            else:
+                consecutive_convergence_tests_passed = 0
         
         pbar.close()
         
diff --git a/PyOCN/plotting.py b/PyOCN/plotting.py
index b7ba57b..9b1323e 100644
--- a/PyOCN/plotting.py
+++ b/PyOCN/plotting.py
@@ -16,14 +16,20 @@
 from typing import Any, TYPE_CHECKING, Literal
 
 import numpy as np
-import matplotlib.pyplot as plt
 import networkx as nx
 import warnings
+import matplotlib.pyplot as plt
 
 if TYPE_CHECKING:
     from .ocn import OCN
 from .utils import unwrap_digraph
 
+def _get_plt():
+    import importlib
+    if not importlib.util.find_spec("matplotlib"):
+        raise ImportError("matplotlib is required for plotting functions.")
+    return importlib.import_module("matplotlib.pyplot")
+
 def _pos_to_xy(dag: nx.DiGraph, nrows=None) -> dict[Any, tuple[float, float]]:
     """
     Convert node ``pos`` from (row, col) to plotting coordinates (x, y).
@@ -88,6 +94,7 @@ def plot_ocn_as_dag(ocn: OCN, attribute: str | None = None, ax=None, norm=None,
     pos = _pos_to_xy(dag, nrows=ocn.dims[0])
 
     if ax is None:
+        plt = _get_plt()
         _, ax = plt.subplots()
 
     node_color = "C0"
@@ -143,6 +150,7 @@ def plot_ocn_raster(ocn: OCN, attribute:Literal['energy', 'drained_area', 'eleva
         kwargs["cmap"] = "terrain"
         
     if ax is None:
+        plt = _get_plt()
         _, ax = plt.subplots()
 
     im = ax.imshow(array, **kwargs)
@@ -176,6 +184,7 @@ def plot_positional_digraph(dag: nx.DiGraph, ax=None, nrows:int|None=None, **kwa
     pos = _pos_to_xy(dag, nrows)
 
     if ax is None:
+        plt = _get_plt()
         _, ax = plt.subplots()
 
     p = nx.draw_networkx(dag, pos=pos, ax=ax, **kwargs)
diff --git a/PyOCN/utils.py b/PyOCN/utils.py
index 843af0c..c72ba31 100644
--- a/PyOCN/utils.py
+++ b/PyOCN/utils.py
@@ -10,10 +10,11 @@
 import os
 from typing import Any, Literal, Callable, TYPE_CHECKING, Union
 from numbers import Number
+from functools import partial
+
 import networkx as nx
 import numpy as np
 from tqdm import tqdm
-from functools import partial
 
 import PyOCN._libocn_bindings as _bindings
 from PyOCN._statushandler import check_status
@@ -50,11 +51,11 @@ def net_type_to_dag(net_type:Literal["I", "H", "V", "E"], dims:tuple, pbar: bool
           ::
 
               O  O  O  O  O
-               \  \ | /  /
+               \\  \\ | /  /
               O  O  O  O  O
-               \  \ | /  /
+               \\  \\ | /  /
               O  O  O  O  O
-               \  \ | /  /
+               \\  \\ | /  /
               O--O--X--O--O
 
         - "H":
@@ -192,7 +193,7 @@ def simulated_annealing_schedule(dims: tuple[int, int],E0: float,constant_phase:
 
     .. math::
 
-        T_i = E_0 \exp\left(-\\frac{r\cdot(i - n_0)}{N}\\right),
+        T_i = E_0 \\exp\\left(-\\frac{r\\cdot(i - n_0)}{N}\\right),
 
     where ``E0`` is the initial energy, ``n0`` is the number of iterations in
     the constant phase, ``r`` is the cooling rate,and ``N = rows * cols``.
diff --git a/docs/conf.py b/docs/conf.py
index 16f46f1..10f9cb8 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -14,7 +14,7 @@
 project = 'PyOCN'
 copyright = '2025, Alexander S. Fox'
 author = 'Alexander S. Fox'
-version = "1.5.20251121"
+version = "1.6.20251210"
 
 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
diff --git a/pyproject.toml b/pyproject.toml
index 3469e2d..c0ed2b9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "PyOCN"
-version = "1.5.20251121"
+version = "1.6.20251210"
 description = "Optimal Channel Networks (OCN) in Python with a C core"
 readme = "readme.md"
 requires-python = ">=3.10"
@@ -14,13 +14,13 @@ dependencies = [
   "numpy>=1.24",  # should check the actual min
   "networkx>=3.1",  # should check the actual min
   "tqdm>=4.44",  # should check the actual min
-  "matplotlib>=3.5",  # should check the actual min
 ]
 
 [project.optional-dependencies]
 raster = [
   "rasterio>=1.3",
   "xarray>=2024.2",  
+  "matplotlib>=3.5",  # should check the actual min
 ]
 
 [project.urls]
diff --git a/setup.py b/setup.py
index d489d2b..9d97891 100644
--- a/setup.py
+++ b/setup.py
@@ -18,12 +18,28 @@
 sources = [str(s) for s in sources]
 
 if sys.platform.startswith(("linux", "darwin")):
-    extra_compile_args = ["-O3", "-flto", "-fPIC", "-std=c99", "-Wall", "-pedantic", "-march=native"]
-    extra_link_args = ["-O3", "-flto"]
+    extra_compile_args = [
+        "-O3",
+        "-flto", 
+        "-fPIC", 
+        "-std=c99", 
+        "-Wall", 
+        "-pedantic", 
+        "-march=native"
+    ]
+    extra_link_args = [
+        "-O3",
+        "-flto"
+    ]
     # Link libm for pow() on Unix
     libraries = ["m"]
 elif sys.platform.startswith("win"):
-    extra_compile_args = ["/O2"]
+    extra_compile_args = [
+        "/O2",
+        "/Ot",
+        "/Oi",
+        "/Oy",
+    ]
     extra_link_args = []
     libraries = []
 else:
diff --git a/tests/test_basic.py b/tests/test_basic.py
index cddf7d3..c855aad 100644
--- a/tests/test_basic.py
+++ b/tests/test_basic.py
@@ -207,26 +207,17 @@ def test_periodic_boundaries(self):
 
     def test_fit_convergence(self):
         """Test that early exit works as intended."""
-        ocn = po.OCN.from_net_type("E", dims=(20, 20), random_state=7777)
-        
-        ocn.fit(n_iterations=20*20*500, pbar=False, tol=1e-4)
-        final_energy = ocn.energy
+        ocn = po.OCN.from_net_type("E", dims=(32, 32), random_state=1497028)
+        ocn.fit(tol=1e-4, pbar=False, max_iterations_per_loop=1000, n_iterations=32*32*100)
+        self.assertEqual(ocn.history.shape[0], 37)
 
-        self.assertIsInstance(final_energy, float)
-        self.assertGreater(len(ocn.history), 0)
-        
-        # History should have correct structure: [iteration, energy, temperature]
-        self.assertEqual(ocn.history.shape[1], 3)
-        self.assertTrue(np.all(ocn.history[:, 0] >= 0))  # iterations >= 0
-        self.assertTrue(np.all(ocn.history[:, 1] > 0))   # energy > 0
-        self.assertTrue(np.all(ocn.history[:, 2] >= 0))  # temperature >= 0
-        
-        # final energy should match last history entry
-        self.assertEqual(ocn.energy, ocn.history[-1, 1])
+        ocn = po.OCN.from_net_type("E", dims=(32, 32), random_state=1497028)
+        ocn.fit(tol=1e-8, pbar=False, max_iterations_per_loop=1000, n_iterations=32*32*100)
+        self.assertEqual(ocn.history.shape[0], 51)
 
-        # iteration should have stopped before max iterations and should be less than tol
-        self.assertLess(ocn.history[-1, 0], 20*20*500)
-        self.assertLessEqual((ocn.history[-1, 1] - ocn.history[-2, 1])/ocn.history[-1, 1], 1e-4)
+        ocn = po.OCN.from_net_type("E", dims=(32, 32), random_state=1497028)
+        ocn.fit(tol=None, pbar=False, max_iterations_per_loop=1000, n_iterations=32*32*100)
+        self.assertEqual(ocn.history.shape[0], 104)
 
     def test_single_iteration(self):
         """Test single iteration method."""