From 3d4218c5633a8b721f001a5b90974c1cdd93e74f Mon Sep 17 00:00:00 2001 From: Ravi Kumar Suresh Babu Date: Thu, 29 Jan 2026 14:57:19 -0800 Subject: [PATCH 1/6] changes --- CHANGELOG.md | 1 + src/snowflake/snowpark/dataframe_analytics_functions.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c9b5998f16..427c0fd0f0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ #### Improvements - `snowflake.snowpark.context.configure_development_features` is effective for multiple sessions including newly created sessions after the configuration. No duplicate experimental warning any more. - Removed experimental warning from `DataFrame.to_arrow` and `DataFrame.to_arrow_batches`. +- Removed experimental warning from `DataFrameAnalyticsFunctions.time_series_agg()`. ### Snowpark pandas API Updates diff --git a/src/snowflake/snowpark/dataframe_analytics_functions.py b/src/snowflake/snowpark/dataframe_analytics_functions.py index f60159d4b8..8c146e09cb 100644 --- a/src/snowflake/snowpark/dataframe_analytics_functions.py +++ b/src/snowflake/snowpark/dataframe_analytics_functions.py @@ -10,7 +10,7 @@ build_expr_from_snowpark_column_or_col_name, with_src_position, ) -from snowflake.snowpark._internal.utils import experimental, publicapi, warning +from snowflake.snowpark._internal.utils import publicapi, warning from snowflake.snowpark.column import Column, _to_col_if_str from snowflake.snowpark.functions import ( _call_function, @@ -629,7 +629,6 @@ def compute_lead( return df - @experimental(version="1.12.0") @publicapi def time_series_agg( self, From 5de9f44a470c7a22381d1703402bf90d5a6f00d3 Mon Sep 17 00:00:00 2001 From: Ravi Kumar Suresh Babu Date: Thu, 29 Jan 2026 15:10:27 -0800 Subject: [PATCH 2/6] SNOW-2680714: Fix data leakage in time_series_agg by excluding current row Breaking change in v1.45.0: - Changed window frame to exclude current row to prevent data leakage in ML use cases - Past windows: now use -interval to 1 PRECEDING (was: -interval to CURRENT ROW) - Future windows: now use 1 FOLLOWING to interval (was: CURRENT ROW to interval) - Updated all integration tests with new expected values - Updated proto file with breaking change comment - Updated CHANGELOG with breaking change section --- CHANGELOG.md | 7 ++ .../snowpark/_internal/proto/ast.proto | 2 + .../snowpark/dataframe_analytics_functions.py | 8 +- tests/integ/test_df_analytics.py | 91 +++++++++++++++---- 4 files changed, 86 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 427c0fd0f0..0d5a44c46a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,13 @@ - Fixed a bug that opentelemetry is not correctly import when using `Session.client_telemetry.enable_event_table_telemetry_collection`. +#### Breaking Changes + +- **`DataFrameAnalyticsFunctions.time_series_agg()`**: The current row is now excluded from window aggregations to prevent data leakage in ML use cases. + - For past windows (negative), the range is now from `-interval` to `1 PRECEDING` (was `-interval` to `CURRENT ROW`). + - For future windows (positive), the range is now from `1 FOLLOWING` to `interval` (was `CURRENT ROW` to `interval`). + - This change affects the computed aggregation values. Update your queries accordingly if you rely on the old behavior. + #### Improvements - `snowflake.snowpark.context.configure_development_features` is effective for multiple sessions including newly created sessions after the configuration. No duplicate experimental warning any more. - Removed experimental warning from `DataFrame.to_arrow` and `DataFrame.to_arrow_batches`. diff --git a/src/snowflake/snowpark/_internal/proto/ast.proto b/src/snowflake/snowpark/_internal/proto/ast.proto index b3f792d4c4..1daa0b940c 100644 --- a/src/snowflake/snowpark/_internal/proto/ast.proto +++ b/src/snowflake/snowpark/_internal/proto/ast.proto @@ -943,6 +943,8 @@ message DataframeAnalyticsMovingAgg { } // dataframe-analytics.ir:45 +// BREAKING CHANGE in v1.45.0: Current row is now excluded from aggregations +// to prevent data leakage. Window ranges changed from CURRENT_ROW to PRECEDING/FOLLOWING. message DataframeAnalyticsTimeSeriesAgg { repeated Tuple_String_List_String aggs = 1; Expr df = 2; diff --git a/src/snowflake/snowpark/dataframe_analytics_functions.py b/src/snowflake/snowpark/dataframe_analytics_functions.py index 8c146e09cb..cc922321fb 100644 --- a/src/snowflake/snowpark/dataframe_analytics_functions.py +++ b/src/snowflake/snowpark/dataframe_analytics_functions.py @@ -642,7 +642,7 @@ def time_series_agg( ) -> "snowflake.snowpark.dataframe.DataFrame": """ Applies aggregations to the specified columns of the DataFrame over specified time windows, - and grouping criteria. + and grouping criteria. The current row is excluded from the aggregation to prevent data leakage to models. Args: aggs: A dictionary where keys are column names and values are lists of the desired aggregation functions. @@ -776,9 +776,11 @@ def time_series_agg( ) if window_sign > 0: - range_start, range_end = Window.CURRENT_ROW, interval + # Future windows: from 1 row ahead to interval + range_start, range_end = Window.FOLLOWING, interval else: - range_start, range_end = -interval, Window.CURRENT_ROW + # Past windows: from -interval to 1 row before + range_start, range_end = -interval, Window.PRECEDING window_spec = ( Window.partition_by(group_by) diff --git a/tests/integ/test_df_analytics.py b/tests/integ/test_df_analytics.py index 8c678ac65b..ed091743ea 100644 --- a/tests/integ/test_df_analytics.py +++ b/tests/integ/test_df_analytics.py @@ -444,16 +444,31 @@ def custom_formatter(input_col, agg, window): "PRODUCTKEY": [101, 101, 101, 102], "ORDERDATE": ["2023-01-01", "2023-01-02", "2023-01-03", "2023-01-04"], "SALESAMOUNT": [200, 100, 300, 250], - "SUM_SALESAMOUNT_1D": [300, 400, 300, 250], - "MAX_SALESAMOUNT_1D": [200, 300, 300, 250], - "SUM_SALESAMOUNT_-1D": [200, 300, 400, 250], - "MAX_SALESAMOUNT_-1D": [200, 200, 300, 250], - "SUM_SALESAMOUNT_2D": [600, 400, 300, 250], - "MAX_SALESAMOUNT_2D": [300, 300, 300, 250], - "SUM_SALESAMOUNT_-2D": [200, 300, 600, 250], - "MAX_SALESAMOUNT_-2D": [200, 200, 300, 250], - "SUM_SALESAMOUNT_-1W": [200, 300, 600, 250], - "MAX_SALESAMOUNT_-1W": [200, 200, 300, 250], + # 1D (future): next day only + "SUM_SALESAMOUNT_1D": [ + 100, + 300, + None, + None, + ], # [100(next), 300(next), NULL, NULL] + "MAX_SALESAMOUNT_1D": [100, 300, None, None], + # -1D (past): previous day only + "SUM_SALESAMOUNT_-1D": [ + None, + 200, + 100, + None, + ], # [NULL, 200(prev), 100(prev), NULL] + "MAX_SALESAMOUNT_-1D": [None, 200, 100, None], + # 2D (future): next 2 days + "SUM_SALESAMOUNT_2D": [400, 300, None, None], # [100+300, 300, NULL, NULL] + "MAX_SALESAMOUNT_2D": [300, 300, None, None], + # -2D (past): previous 2 days + "SUM_SALESAMOUNT_-2D": [None, 200, 300, None], # [NULL, 200, 200+100, NULL] + "MAX_SALESAMOUNT_-2D": [None, 200, 200, None], + # -1W (past): previous week + "SUM_SALESAMOUNT_-1W": [None, 200, 300, None], # [NULL, 200, 200+100, NULL] + "MAX_SALESAMOUNT_-1W": [None, 200, 200, None], } expected_df = pd.DataFrame(expected_data) @@ -500,9 +515,27 @@ def test_time_series_agg_sub_day_sliding_windows(session): ] ), "SALESAMOUNT": [400, 300, 200, 100], - "SALESAMOUNT_MAX_-1s": [400, 400, 200, 100], - "SALESAMOUNT_MAX_-1m": [400, 400, 300, 100], - "SALESAMOUNT_MAX_-1h": [400, 400, 400, 200], + # -1s: previous 1 second only (excludes current) + "SALESAMOUNT_MAX_-1s": [ + None, + 400, + None, + None, + ], # [NULL, 400(1s before), NULL(>1s gap), NULL(>1s gap)] + # -1m: previous 1 minute (excludes current) + "SALESAMOUNT_MAX_-1m": [ + None, + 400, + 300, + None, + ], # [NULL, 400(1s<1m), 300(60s=1m), NULL(>1m gap)] + # -1h: previous 1 hour (excludes current) + "SALESAMOUNT_MAX_-1h": [ + None, + 400, + 400, + 200, + ], # [NULL, 400, MAX(400,300), 200(exactly 1h)] } expected_df = pd.DataFrame(expected_data) @@ -552,8 +585,20 @@ def test_time_series_aggregation_grouping_bug_fix(session): "transaction_4", ], "QUANTITY": [10, 15, 7, 3], - "QUANTITY_SUM_-1D": [10, 15, 22, 3], - "QUANTITY_SUM_-7D": [10, 15, 22, 25], + # -1D: previous 1 day (excludes current) + "QUANTITY_SUM_-1D": [ + None, + None, + 15, + None, + ], # [NULL, NULL, 15(8h earlier), NULL(>1D gap)] + # -7D: previous 7 days (excludes current) + "QUANTITY_SUM_-7D": [ + None, + None, + 15, + 22, + ], # [NULL, NULL, 15(same day), 15+7(2D ago)] } expected_df = pd.DataFrame(expected_data) @@ -596,6 +641,8 @@ def custom_formatter(input_col, agg, window): col_formatter=custom_formatter, ) + # BREAKING CHANGE in v1.45.0: Current row excluded from aggregations + # -2mm: previous 2 months (excludes current) expected_data = { "PRODUCTKEY": [101, 101, 101, 101, 102, 102, 102, 102], "ORDERDATE": [ @@ -609,8 +656,10 @@ def custom_formatter(input_col, agg, window): "2023-04-20", ], "SALESAMOUNT": [100, 200, 300, 400, 150, 250, 350, 450], - "SUM_SALESAMOUNT_-2mm": [100, 300, 600, 900, 150, 400, 750, 1050], - "MAX_SALESAMOUNT_-2mm": [100, 200, 300, 400, 150, 250, 350, 450], + # SUM excluding current row + "SUM_SALESAMOUNT_-2mm": [None, 100, 300, 500, None, 150, 400, 600], + # MAX excluding current row + "MAX_SALESAMOUNT_-2mm": [None, 100, 200, 300, None, 150, 250, 350], } expected_df = pd.DataFrame(expected_data) expected_df["ORDERDATE"] = pd.to_datetime(expected_df["ORDERDATE"]) @@ -655,6 +704,8 @@ def custom_formatter(input_col, agg, window): col_formatter=custom_formatter, ) + # BREAKING CHANGE in v1.45.0: Current row excluded from aggregations + # -1Y: previous 1 year (excludes current) expected_data = { "PRODUCTKEY": [101, 101, 101, 101, 102, 102, 102, 102], "ORDERDATE": [ @@ -668,8 +719,10 @@ def custom_formatter(input_col, agg, window): "2024-01-20", ], "SALESAMOUNT": [100, 200, 300, 400, 150, 250, 350, 450], - "SUM_SALESAMOUNT_-1Y": [100, 300, 500, 700, 150, 400, 600, 800], - "MAX_SALESAMOUNT_-1Y": [100, 200, 300, 400, 150, 250, 350, 450], + # SUM excluding current row + "SUM_SALESAMOUNT_-1Y": [None, 100, 200, 300, None, 150, 250, 350], + # MAX excluding current row + "MAX_SALESAMOUNT_-1Y": [None, 100, 200, 300, None, 150, 250, 350], } expected_df = pd.DataFrame(expected_data) expected_df["ORDERDATE"] = pd.to_datetime(expected_df["ORDERDATE"]) From 525d17cbfc3e93283e28296811d0ab940b96e341 Mon Sep 17 00:00:00 2001 From: Ravi Kumar Suresh Babu Date: Thu, 29 Jan 2026 15:11:51 -0800 Subject: [PATCH 3/6] Remove unnecessary breaking change comments from tests --- tests/integ/test_df_analytics.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/integ/test_df_analytics.py b/tests/integ/test_df_analytics.py index ed091743ea..7f4f0b5cb2 100644 --- a/tests/integ/test_df_analytics.py +++ b/tests/integ/test_df_analytics.py @@ -641,8 +641,6 @@ def custom_formatter(input_col, agg, window): col_formatter=custom_formatter, ) - # BREAKING CHANGE in v1.45.0: Current row excluded from aggregations - # -2mm: previous 2 months (excludes current) expected_data = { "PRODUCTKEY": [101, 101, 101, 101, 102, 102, 102, 102], "ORDERDATE": [ @@ -704,8 +702,6 @@ def custom_formatter(input_col, agg, window): col_formatter=custom_formatter, ) - # BREAKING CHANGE in v1.45.0: Current row excluded from aggregations - # -1Y: previous 1 year (excludes current) expected_data = { "PRODUCTKEY": [101, 101, 101, 101, 102, 102, 102, 102], "ORDERDATE": [ From 773b849d4a55a94a8fa6340f1945d10fdf828823 Mon Sep 17 00:00:00 2001 From: Ravi Kumar Suresh Babu Date: Fri, 30 Jan 2026 06:37:52 -0800 Subject: [PATCH 4/6] fix bug --- CHANGELOG.md | 5 +---- src/snowflake/snowpark/_internal/proto/ast.proto | 2 -- .../snowpark/dataframe_analytics_functions.py | 14 +++++++++----- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0d5a44c46a..84c4e59587 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,10 +26,7 @@ #### Breaking Changes -- **`DataFrameAnalyticsFunctions.time_series_agg()`**: The current row is now excluded from window aggregations to prevent data leakage in ML use cases. - - For past windows (negative), the range is now from `-interval` to `1 PRECEDING` (was `-interval` to `CURRENT ROW`). - - For future windows (positive), the range is now from `1 FOLLOWING` to `interval` (was `CURRENT ROW` to `interval`). - - This change affects the computed aggregation values. Update your queries accordingly if you rely on the old behavior. +- **`DataFrameAnalyticsFunctions.time_series_agg()`**: The current row is now excluded from window aggregations to prevent data leakage. #### Improvements - `snowflake.snowpark.context.configure_development_features` is effective for multiple sessions including newly created sessions after the configuration. No duplicate experimental warning any more. diff --git a/src/snowflake/snowpark/_internal/proto/ast.proto b/src/snowflake/snowpark/_internal/proto/ast.proto index 1daa0b940c..b3f792d4c4 100644 --- a/src/snowflake/snowpark/_internal/proto/ast.proto +++ b/src/snowflake/snowpark/_internal/proto/ast.proto @@ -943,8 +943,6 @@ message DataframeAnalyticsMovingAgg { } // dataframe-analytics.ir:45 -// BREAKING CHANGE in v1.45.0: Current row is now excluded from aggregations -// to prevent data leakage. Window ranges changed from CURRENT_ROW to PRECEDING/FOLLOWING. message DataframeAnalyticsTimeSeriesAgg { repeated Tuple_String_List_String aggs = 1; Expr df = 2; diff --git a/src/snowflake/snowpark/dataframe_analytics_functions.py b/src/snowflake/snowpark/dataframe_analytics_functions.py index cc922321fb..024edab487 100644 --- a/src/snowflake/snowpark/dataframe_analytics_functions.py +++ b/src/snowflake/snowpark/dataframe_analytics_functions.py @@ -642,7 +642,10 @@ def time_series_agg( ) -> "snowflake.snowpark.dataframe.DataFrame": """ Applies aggregations to the specified columns of the DataFrame over specified time windows, - and grouping criteria. The current row is excluded from the aggregation to prevent data leakage to models. + and grouping criteria. + + .. note:: + Aggregations exclude rows within 1 second of the current timestamp to prevent data leakage. Args: aggs: A dictionary where keys are column names and values are lists of the desired aggregation functions. @@ -775,12 +778,13 @@ def time_series_agg( years=interval_args.get("y"), ) + one_second = make_interval(seconds=1) if window_sign > 0: - # Future windows: from 1 row ahead to interval - range_start, range_end = Window.FOLLOWING, interval + range_start = one_second + range_end = interval else: - # Past windows: from -interval to 1 row before - range_start, range_end = -interval, Window.PRECEDING + range_start = -interval + range_end = -one_second window_spec = ( Window.partition_by(group_by) From 81421d99e941c9ac43d038c6f72bb493c802eb98 Mon Sep 17 00:00:00 2001 From: Ravi Kumar Suresh Babu Date: Sun, 1 Feb 2026 19:04:29 -0800 Subject: [PATCH 5/6] fix change log --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 84c4e59587..bc616e0081 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,12 +26,12 @@ #### Breaking Changes -- **`DataFrameAnalyticsFunctions.time_series_agg()`**: The current row is now excluded from window aggregations to prevent data leakage. #### Improvements - `snowflake.snowpark.context.configure_development_features` is effective for multiple sessions including newly created sessions after the configuration. No duplicate experimental warning any more. - Removed experimental warning from `DataFrame.to_arrow` and `DataFrame.to_arrow_batches`. - Removed experimental warning from `DataFrameAnalyticsFunctions.time_series_agg()`. +- **`DataFrameAnalyticsFunctions.time_series_agg()`**: The current row is now excluded from window aggregations to prevent data leakage. ### Snowpark pandas API Updates From f7d34f9e5e57d472e785b8555cedadfbd871a2ff Mon Sep 17 00:00:00 2001 From: Ravi Kumar Suresh Babu Date: Mon, 9 Feb 2026 06:34:49 -0800 Subject: [PATCH 6/6] fix changelog --- CHANGELOG.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bc616e0081..99ee67188a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,9 +24,6 @@ - Fixed a bug that opentelemetry is not correctly import when using `Session.client_telemetry.enable_event_table_telemetry_collection`. -#### Breaking Changes - - #### Improvements - `snowflake.snowpark.context.configure_development_features` is effective for multiple sessions including newly created sessions after the configuration. No duplicate experimental warning any more. - Removed experimental warning from `DataFrame.to_arrow` and `DataFrame.to_arrow_batches`.