From 160174906ac1ee96cdd1dd7d296f267aad52d145 Mon Sep 17 00:00:00 2001
From: Mikel Cortes
Date: Mon, 15 Dec 2025 15:05:53 +0100
Subject: [PATCH 1/4] remove extra code entry

---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index 8de06b3..9e133c0 100644
--- a/README.md
+++ b/README.md
@@ -181,6 +181,5 @@ python -m http.server -d _site
 df = load_parquet("my_data")
 # Visualize...
 ```
-```
 
 4. **Add to site** in `_quarto.yml` navbar

From d673b83e80461895be3834ec56d55b095ae891ba Mon Sep 17 00:00:00 2001
From: Mikel Cortes
Date: Tue, 16 Dec 2025 12:06:39 +0100
Subject: [PATCH 2/4] first draft

---
 _quarto.yml                       |  6 +----
 notebooks/04-network-overview.qmd | 17 ++++++++++++
 pyproject.toml                    |  1 +
 queries/__init__.py               |  5 ++++
 queries/network_overview.py       | 44 +++++++++++++++++++++++++++++++
 scripts/fetch_data.py             |  2 ++
 uv.lock                           | 30 ++++++++++++++++++++-
 7 files changed, 99 insertions(+), 6 deletions(-)
 create mode 100644 notebooks/04-network-overview.qmd
 create mode 100644 queries/network_overview.py

diff --git a/_quarto.yml b/_quarto.yml
index bcf684a..dd188fc 100644
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -15,7 +15,7 @@ website:
     contents:
       - text: Introduction
         href: index.qmd
-      - section: '2025-12-09'
+      - section: '2025-12-14'
        contents:
          - text: Blob inclusion
            href: notebooks/01-blob-inclusion.qmd
@@ -23,10 +23,6 @@ website:
            href: notebooks/02-blob-flow.qmd
          - text: Column propagation
            href: notebooks/03-column-propagation.qmd
-      - section: Historical
-        contents:
-          - text: '2025-12-08'
-            href: 20251208/index.qmd
 format:
   html:
     theme:
diff --git a/notebooks/04-network-overview.qmd b/notebooks/04-network-overview.qmd
new file mode 100644
index 0000000..3186e42
--- /dev/null
+++ b/notebooks/04-network-overview.qmd
@@ -0,0 +1,17 @@
+---
+title: "Network Overview"
+---
+
+Analysis script to compute the overall network overview from the Xatu sentry nodes on Ethereum mainnet.
+
+```{python}
+#| tags: [parameters]
+target_date = None  # Set via papermill, or auto-detect from manifest
+```
+
+```{python}
+import polars as pl
+import plotly.express as px
+
+from loaders import load_parquet
+```
diff --git a/pyproject.toml b/pyproject.toml
index a2c210a..44ddfdb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,6 +14,7 @@ dependencies = [
     "jupytext>=1.18.1",
     "jupyterlab>=4.5.0",
     "pyyaml>=6.0.3",
+    "polars>=1.36.1",
 ]
 
 [dependency-groups]
diff --git a/queries/__init__.py b/queries/__init__.py
index a767c6d..8c5f76a 100644
--- a/queries/__init__.py
+++ b/queries/__init__.py
@@ -12,6 +12,9 @@
 )
 from queries.blob_flow import fetch_proposer_blobs
 from queries.column_propagation import fetch_col_first_seen, NUM_COLUMNS
+from queries.network_overview import (
+    fetch_unique_network_participants_per_client,
+)
 
 __all__ = [
     # Blob inclusion
@@ -24,4 +27,6 @@
     # Column propagation
     "fetch_col_first_seen",
     "NUM_COLUMNS",
+    # Network overview
+    "fetch_unique_network_participants_per_client",
 ]
diff --git a/queries/network_overview.py b/queries/network_overview.py
new file mode 100644
index 0000000..b081775
--- /dev/null
+++ b/queries/network_overview.py
@@ -0,0 +1,44 @@
+"""
+Fetch functions for network overview analysis.
+
+Each function executes SQL and writes directly to Parquet.
+""" + +from pathlib import Path + + +def _get_date_filter(target_date: str, column: str = "slot_start_date_time") -> str: + """Generate SQL date filter for a specific date.""" + return f"{column} BETWEEN '{target_date}' AND '{target_date}'::date + INTERVAL 1 DAY" + + +def fetch_unique_network_participants_per_client( + client, + target_date: str, + output_path: Path, + network: str = "mainnet", +) -> int: + """Fetch blobs per slot data and write to Parquet. + + Returns row count. + """ + date_filter = _get_date_filter(target_date, column="event_date_time") + + query = f""" +SELECT + toStartOfInterval(event_date_time, INTERVAL 1 hour) AS hour_bucket, + remote_peer_id_unique_key as peer_id, + remote_agent_implementation as client_name, + meta_client_name as local_name +FROM libp2p_synthetic_heartbeat_local +WHERE + meta_network_name LIKE 'mainnet' + AND {date_filter} +ORDER BY hour_bucket ASC +""" + + df = client.query_df(query) + output_path.parent.mkdir(parents=True, exist_ok=True) + df.to_parquet(output_path, index=False) + return len(df) + diff --git a/scripts/fetch_data.py b/scripts/fetch_data.py index 06e0855..7afff16 100644 --- a/scripts/fetch_data.py +++ b/scripts/fetch_data.py @@ -29,6 +29,7 @@ fetch_slot_in_epoch, fetch_proposer_blobs, fetch_col_first_seen, + fetch_unique_network_participants_per_client, ) # List of (name, fetcher) tuples @@ -39,6 +40,7 @@ ("slot_in_epoch", fetch_slot_in_epoch), ("proposer_blobs", fetch_proposer_blobs), ("col_first_seen", fetch_col_first_seen), + ("unique_network_participants_per_client", fetch_unique_network_participants_per_client), ] diff --git a/uv.lock b/uv.lock index b60929a..c3da5d2 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 requires-python = ">=3.13" resolution-markers = [ "python_full_version >= '3.14'", @@ -1175,6 +1175,7 @@ dependencies = [ { name = "numpy" }, { name = "pandas" }, { name = "plotly" }, + { name = "polars" }, { name = "pyarrow" }, { name = "python-dotenv" }, { name = "pyyaml" }, @@ -1197,6 +1198,7 @@ requires-dist = [ { name = "numpy", specifier = ">=1.26" }, { name = "pandas", specifier = ">=2.0" }, { name = "plotly", specifier = ">=5.0" }, + { name = "polars", specifier = ">=1.36.1" }, { name = "pyarrow", specifier = ">=22.0.0" }, { name = "python-dotenv", specifier = ">=1.0" }, { name = "pyyaml", specifier = ">=6.0.3" }, @@ -1244,6 +1246,32 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e7/c3/3031c931098de393393e1f93a38dc9ed6805d86bb801acc3cf2d5bd1e6b7/plotly-6.5.0-py3-none-any.whl", hash = "sha256:5ac851e100367735250206788a2b1325412aa4a4917a4fe3e6f0bc5aa6f3d90a", size = 9893174, upload-time = "2025-11-17T18:39:20.351Z" }, ] +[[package]] +name = "polars" +version = "1.36.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "polars-runtime-32" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9f/dc/56f2a90c79a2cb13f9e956eab6385effe54216ae7a2068b3a6406bae4345/polars-1.36.1.tar.gz", hash = "sha256:12c7616a2305559144711ab73eaa18814f7aa898c522e7645014b68f1432d54c", size = 711993, upload-time = "2025-12-10T01:14:53.033Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f6/c6/36a1b874036b49893ecae0ac44a2f63d1a76e6212631a5b2f50a86e0e8af/polars-1.36.1-py3-none-any.whl", hash = "sha256:853c1bbb237add6a5f6d133c15094a9b727d66dd6a4eb91dbb07cdb056b2b8ef", size = 802429, upload-time = "2025-12-10T01:13:53.838Z" }, +] + +[[package]] +name = "polars-runtime-32" +version = "1.36.1" +source = { registry = 
"https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/31/df/597c0ef5eb8d761a16d72327846599b57c5d40d7f9e74306fc154aba8c37/polars_runtime_32-1.36.1.tar.gz", hash = "sha256:201c2cfd80ceb5d5cd7b63085b5fd08d6ae6554f922bcb941035e39638528a09", size = 2788751, upload-time = "2025-12-10T01:14:54.172Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e1/ea/871129a2d296966c0925b078a9a93c6c5e7facb1c5eebfcd3d5811aeddc1/polars_runtime_32-1.36.1-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:327b621ca82594f277751f7e23d4b939ebd1be18d54b4cdf7a2f8406cecc18b2", size = 43494311, upload-time = "2025-12-10T01:13:56.096Z" }, + { url = "https://files.pythonhosted.org/packages/d8/76/0038210ad1e526ce5bb2933b13760d6b986b3045eccc1338e661bd656f77/polars_runtime_32-1.36.1-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:ab0d1f23084afee2b97de8c37aa3e02ec3569749ae39571bd89e7a8b11ae9e83", size = 39300602, upload-time = "2025-12-10T01:13:59.366Z" }, + { url = "https://files.pythonhosted.org/packages/54/1e/2707bee75a780a953a77a2c59829ee90ef55708f02fc4add761c579bf76e/polars_runtime_32-1.36.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:899b9ad2e47ceb31eb157f27a09dbc2047efbf4969a923a6b1ba7f0412c3e64c", size = 44511780, upload-time = "2025-12-10T01:14:02.285Z" }, + { url = "https://files.pythonhosted.org/packages/11/b2/3fede95feee441be64b4bcb32444679a8fbb7a453a10251583053f6efe52/polars_runtime_32-1.36.1-cp39-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:d9d077bb9df711bc635a86540df48242bb91975b353e53ef261c6fae6cb0948f", size = 40688448, upload-time = "2025-12-10T01:14:05.131Z" }, + { url = "https://files.pythonhosted.org/packages/05/0f/e629713a72999939b7b4bfdbf030a32794db588b04fdf3dc977dd8ea6c53/polars_runtime_32-1.36.1-cp39-abi3-win_amd64.whl", hash = "sha256:cc17101f28c9a169ff8b5b8d4977a3683cd403621841623825525f440b564cf0", size = 44464898, upload-time = "2025-12-10T01:14:08.296Z" }, + { url = "https://files.pythonhosted.org/packages/d1/d8/a12e6aa14f63784cead437083319ec7cece0d5bb9a5bfe7678cc6578b52a/polars_runtime_32-1.36.1-cp39-abi3-win_arm64.whl", hash = "sha256:809e73857be71250141225ddd5d2b30c97e6340aeaa0d445f930e01bef6888dc", size = 39798896, upload-time = "2025-12-10T01:14:11.568Z" }, +] + [[package]] name = "prometheus-client" version = "0.23.1" From d4e6ffbe03d01e313881dddf81a8dc0841d7b21c Mon Sep 17 00:00:00 2001 From: Mikel Cortes Date: Tue, 16 Dec 2025 18:38:34 +0100 Subject: [PATCH 3/4] working commit --- _quarto.yml | 2 + index.qmd | 1 + notebooks/04-network-overview.qmd | 62 ++++++++++++++++++++++++++++++- scripts/prepare_publish.py | 1 + 4 files changed, 65 insertions(+), 1 deletion(-) diff --git a/_quarto.yml b/_quarto.yml index dd188fc..0ca7b37 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -23,6 +23,8 @@ website: href: notebooks/02-blob-flow.qmd - text: Column propagation href: notebooks/03-column-propagation.qmd + - text: Network overview + href: notebooks/04-network-overview.qmd format: html: theme: diff --git a/index.qmd b/index.qmd index cb90a50..5d2c135 100644 --- a/index.qmd +++ b/index.qmd @@ -11,6 +11,7 @@ A collection of notebooks analyzing P2P dynamics in Ethereum networks. Currently - [Blob inclusion](notebooks/01-blob-inclusion.qmd): Blob inclusion patterns per block and epoch. - [Blob flow](notebooks/02-blob-flow.qmd): Flow diagrams tracing blob packing per entities, builders, and relays. - [Column propagation](notebooks/03-column-propagation.qmd): Column propagation timing across 128 data columns subnets. 
+- [Network overview](notebooks/04-network-overview.qmd): General view of the P2P network.
 
 ## Generation
 
diff --git a/notebooks/04-network-overview.qmd b/notebooks/04-network-overview.qmd
index 3186e42..ed63764 100644
--- a/notebooks/04-network-overview.qmd
+++ b/notebooks/04-network-overview.qmd
@@ -12,6 +12,66 @@
 ```{python}
 import polars as pl
 import plotly.express as px
-
 from loaders import load_parquet
+
+raw_df = load_parquet("unique_network_participants_per_client", target_date)
 ```
+
+## Total unique peers in the network
+
+```{python}
+# Display the number of unique peers in the network
+raw_df = load_parquet("unique_network_participants_per_client", target_date)
+
+# get the number of unique peers
+df = (
+    pl.from_pandas(raw_df)
+    .group_by("hour_bucket")
+    .agg(unique_peers=pl.col("peer_id").n_unique())
+    .sort("hour_bucket")
+)
+
+fig = px.line(
+    df,
+    x="hour_bucket",
+    y="unique_peers",
+)
+
+fig.update_layout(
+    title="Total number of unique peers",
+    xaxis_title="Date",
+    yaxis_title="Unique peers",
+    font_size=12,
+    width=1200,
+    height=800,
+)
+```
+
+## Client distribution of the unique peers
+
+```{python}
+# get the number of unique peers
+df = (
+    pl.from_pandas(raw_df)
+    .group_by(["hour_bucket", "client_name"])
+    .agg(unique_peers=pl.col("peer_id").n_unique())
+    .sort("hour_bucket")
+)
+
+fig = px.area(
+    df,
+    x="hour_bucket",
+    y="unique_peers",
+    color="client_name",
+)
+
+fig.update_layout(
+    title="Unique peers per client",
+    xaxis_title="Date",
+    yaxis_title="Unique peers",
+    font_size=12,
+    width=1200,
+    height=800,
+)
+```
+
diff --git a/scripts/prepare_publish.py b/scripts/prepare_publish.py
index 63fa426..c441da0 100644
--- a/scripts/prepare_publish.py
+++ b/scripts/prepare_publish.py
@@ -20,6 +20,7 @@
     ("01-blob-inclusion", "Blob inclusion"),
     ("02-blob-flow", "Blob flow"),
     ("03-column-propagation", "Column propagation"),
+    ("04-network-overview", "Network overview"),
 ]
 
 DATA_ROOT = Path("notebooks/data")

From f501fa67f65f87103dd8be9e81042f023ab9610e Mon Sep 17 00:00:00 2001
From: Mikel Cortes
Date: Thu, 18 Dec 2025 10:42:54 +0100
Subject: [PATCH 4/4] swap gsp heartbeat table to connected_peers one

---
 notebooks/04-network-overview.qmd | 125 ++++++++++++++++++++++++++----
 queries/__init__.py               |   4 +-
 queries/network_overview.py       |  17 ++++--
 scripts/fetch_data.py             |   4 +-
 4 files changed, 127 insertions(+), 23 deletions(-)

diff --git a/notebooks/04-network-overview.qmd b/notebooks/04-network-overview.qmd
index ed63764..d16627a 100644
--- a/notebooks/04-network-overview.qmd
+++ b/notebooks/04-network-overview.qmd
@@ -7,6 +7,7 @@ Analysis script to compute the overall network overview from the Xatu sentry nod
 ```{python}
 #| tags: [parameters]
 target_date = None  # Set via papermill, or auto-detect from manifest
+network = None  # Set via papermill, or auto-detect from manifest
 ```
 
 ```{python}
@@ -14,16 +15,13 @@ import polars as pl
 import plotly.express as px
 from loaders import load_parquet
 
-raw_df = load_parquet("unique_network_participants_per_client", target_date)
+raw_df = load_parquet("xatu_client_connectivity", target_date)
 ```
 
 ## Total unique peers in the network
 
 ```{python}
 # Display the number of unique peers in the network
-raw_df = load_parquet("unique_network_participants_per_client", target_date)
-
-# get the number of unique peers
 df = (
     pl.from_pandas(raw_df)
     .group_by("hour_bucket")
@@ -41,9 +39,6 @@ fig.update_layout(
     title="Total number of unique peers",
     xaxis_title="Date",
yaxis_title="Unique peers", - font_size=12, - width=1200, - height=800, ) ``` @@ -53,25 +48,129 @@ fig.update_layout( # get the number of unique peers df = ( pl.from_pandas(raw_df) - .group_by(["hour_bucket", "client_name"]) - .agg(unique_peers=pl.col("peer_id").n_unique()) - .sort("hour_bucket") + .sort(["hour_bucket", "peer_id", "client_name"], descending=[False, False, True]) + .unique(subset=["hour_bucket", "peer_id"], keep="first") + .filter( + pl.col("client_name").is_not_null() & (pl.col("client_name") != "") + ) + .group_by(["hour_bucket","client_name"]) + .agg(peers=pl.len()) + .sort("hour_bucket", "peers") ) fig = px.area( df, x="hour_bucket", - y="unique_peers", + y="peers", color="client_name", ) fig.update_layout( title="Total number of unique peers", xaxis_title="Date", - yaxis_title="Unique peers", - font_size=12, + yaxis_title="Peers", + width=1200, + height=800, +) +``` + +## Number of connections from each Xatu node + +```{python} +# Plot the number of connections per each Xatu node +df = ( + pl.from_pandas(raw_df) + .group_by(["hour_bucket", "local_name"]) + .agg(peers=pl.col("peer_id").n_unique()) + .sort("hour_bucket") + .with_columns( + pl.col("local_name").str.replace(f"ethpandaops/{network}/", "") + ) +) + +fig = px.line( + df, + x="hour_bucket", + y="peers", + color="local_name", +) + +fig.update_layout( + title="Connections per Xatu nodes", + xaxis_title=None, + yaxis_title="Connected peers", + legend=dict( + title="Client Names", + orientation = "h", + yanchor="top", + y=-.25, + xanchor="center", + x=0.5, + # entrywidth=300, + ), + width=1200, + height=800, +) +``` + +## Distribution of connections to peers on each IP protocol + Transport protocol combination + +```{python} +df = ( + pl.from_pandas(raw_df) + .group_by(["hour_bucket", "peer_id", "protocol"]) + .agg( + all_transports=pl.col("transport_protocol").unique().sort().str.join(" & ") + ) + .with_columns( + protocol_combos=pl.col("protocol") + " + (" + pl.col("all_transports") + ")" + ) + .group_by(["hour_bucket", "protocol_combos"]) + .agg(peers=pl.count("peer_id")) + .sort("hour_bucket") +) + +fig = px.line( + df, + x="hour_bucket", + y="peers", + color="protocol_combos", +) + +fig.update_layout( + title="Transport protocol distribution for Xatu nodes", + yaxis_title="Connected peers", width=1200, height=800, ) + ``` +## Popularity of ports + +```{python} +df = ( + pl.from_pandas(raw_df) + # this might double count peers that use different ports in the same day + .group_by(["peer_id", "port"]) + .agg() + .group_by("port") + .agg(peers=pl.count("peer_id")) + .with_columns(port=pl.col("port").cast(pl.String)) + .sort("peers", descending=True) +) + +fig = px.bar( + df.head(20), + x="port", + y="peers", +) +fig.update_xaxes(type='category') +fig.update_layout( + title="Popularity of ports", + xaxis_title=None, + yaxis_title="Connected peers", + width=1200, + height=800, +) +``` \ No newline at end of file diff --git a/queries/__init__.py b/queries/__init__.py index 8c5f76a..2b31343 100644 --- a/queries/__init__.py +++ b/queries/__init__.py @@ -13,7 +13,7 @@ from queries.blob_flow import fetch_proposer_blobs from queries.column_propagation import fetch_col_first_seen, NUM_COLUMNS from queries.network_overview import ( - fetch_unique_network_participants_per_client, + fetch_xatu_client_connectivity, ) __all__ = [ @@ -28,5 +28,5 @@ "fetch_col_first_seen", "NUM_COLUMNS", # Network overview - "fetch_unique_network_participants_per_client", + "fetch_xatu_client_connectivity", ] diff --git 
a/queries/network_overview.py b/queries/network_overview.py index b081775..ac84c09 100644 --- a/queries/network_overview.py +++ b/queries/network_overview.py @@ -12,13 +12,14 @@ def _get_date_filter(target_date: str, column: str = "slot_start_date_time") -> return f"{column} BETWEEN '{target_date}' AND '{target_date}'::date + INTERVAL 1 DAY" -def fetch_unique_network_participants_per_client( +def fetch_xatu_client_connectivity( client, target_date: str, output_path: Path, network: str = "mainnet", ) -> int: - """Fetch blobs per slot data and write to Parquet. + """Fetch the unique number of peer_ids know using the gossipsub synthetic_heartbeat + data and write to Parquet. Returns row count. """ @@ -28,12 +29,16 @@ def fetch_unique_network_participants_per_client( SELECT toStartOfInterval(event_date_time, INTERVAL 1 hour) AS hour_bucket, remote_peer_id_unique_key as peer_id, + remote_protocol as protocol, + remote_transport_protocol as transport_protocol, + remote_port as port, remote_agent_implementation as client_name, - meta_client_name as local_name -FROM libp2p_synthetic_heartbeat_local + meta_client_name as local_name, + remote_geo_country_code as geo_country_code +FROM libp2p_connected_local WHERE - meta_network_name LIKE 'mainnet' - AND {date_filter} + meta_network_name LIKE '{network}' + AND {date_filter} ORDER BY hour_bucket ASC """ diff --git a/scripts/fetch_data.py b/scripts/fetch_data.py index 7afff16..6f1e4f9 100644 --- a/scripts/fetch_data.py +++ b/scripts/fetch_data.py @@ -29,7 +29,7 @@ fetch_slot_in_epoch, fetch_proposer_blobs, fetch_col_first_seen, - fetch_unique_network_participants_per_client, + fetch_xatu_client_connectivity, ) # List of (name, fetcher) tuples @@ -40,7 +40,7 @@ ("slot_in_epoch", fetch_slot_in_epoch), ("proposer_blobs", fetch_proposer_blobs), ("col_first_seen", fetch_col_first_seen), - ("unique_network_participants_per_client", fetch_unique_network_participants_per_client), + ("xatu_client_connectivity", fetch_xatu_client_connectivity), ]