augment_expanding

augment_expanding(
    data,
    date_column,
    value_column,
    window_func='mean',
    min_periods=None,
    engine='pandas',
    threads=1,
    show_progress=True,
    reduce_memory=False,
    **kwargs,
)

Apply one or more Series-based expanding functions to one or more columns of a DataFrame.

Parameters

Name	Type	Description	Default
data	Union[pd.DataFrame, pd.core.groupby.generic.DataFrameGroupBy]	Input data to be processed. Can be a Pandas DataFrame or a GroupBy object.	required
date_column	str	Name of the datetime column. Data is sorted by this column within each group.	required
value_column	Union[str, list]	Column(s) to which the expanding window functions should be applied. Can be a single column name or a list.	required
window_func	Union[str, list, Tuple[str, Callable]]	The function(s) to apply to the expanding windows of the value column(s). It can be a single specification or a list mixing both of the following forms: 1. Built-in functions: - A string representing the name of a standard function (e.g., ‘mean’, ‘sum’). 2. Custom functions: - A tuple containing a custom name for the function and the function itself. - Each custom function should accept a Pandas Series as its input and operate on that series. Example: (“range”, lambda x: x.max() - x.min()) (See more Examples below.) Note: If your function needs to operate on multiple columns (i.e., it requires access to a DataFrame rather than just a Series), consider using the `augment_expanding_apply` function in this library.	`'mean'`
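min_periods	int	Minimum number of observations required in the expanding window to produce a value; rows with fewer accumulated observations get a missing value. The Examples below use `min_periods=1` so that results start at the first observation of each group. Note: an expanding window has no fixed window size, so the behavior when left as `None` should be confirmed against the implementation.	`None`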
engine	str	Specifies the backend computation library for augmenting expanding window functions. The options are: - “pandas” (default): Uses the `pandas` library. - “polars”: Uses the `polars` library, which may offer performance benefits for larger datasets.	`'pandas'`
threads	int	Number of threads to use for parallel processing. If `threads` is set to 1, parallel processing will be disabled. Set to -1 to use all available CPU cores.	`1`
show_progress	bool	If `True`, a progress bar will be displayed during parallel processing.	`True`
reduce_memory	bool	The `reduce_memory` parameter is used to specify whether to reduce the memory usage of the DataFrame by converting int, float to smaller bytes and str to categorical data. This reduces memory for large data but may impact resolution of float and will change str to categorical. Default is False.	`False`
**kwargs	additional keyword arguments	Additional arguments passed to the `pandas.Series.expanding` method when using the Pandas engine.	`{}`

Returns

Name	Type	Description
	pd.DataFrame	The `augment_expanding` function returns a DataFrame with a new column for each combination of value column and applied function, named `<value_column>_expanding_<function_name>` (e.g., `value_expanding_mean`).

Notes

Performance

Polars Engine (3X faster than Pandas)

In most cases, the polars engine will be faster than the pandas engine. Speed tests indicate 3X or more.

Parallel Processing (Pandas Engine Only)

With the pandas engine, this function can use parallel processing to speed up computation for large datasets with many time series groups (it is disabled by default, since `threads=1`):

Parallel processing has overhead and may not be faster on small datasets.

To use parallel processing, set threads = -1 to use all available processors.

Examples

# Example 1 - Pandas Backend for Expanding Window Functions
# This example demonstrates the use of string-named functions
# on an expanding window using the Pandas backend for computations.

import pytimetk as tk
import pandas as pd
import numpy as np

df = tk.load_dataset("m4_daily", parse_dates = ['date'])

expanded_df = (
    df
        .groupby('id')
        .augment_expanding(
            date_column = 'date',
            value_column = 'value',
            window_func = [
                'mean',  # Built-in mean function
                'std',   # Built-in standard deviation function,
                 ('quantile_75', lambda x: pd.Series(x).quantile(0.75)),  # Custom quantile function

            ],
            min_periods = 1,
            engine = 'pandas',  # Utilize pandas for the underlying computations
            threads = 1,  # Disable parallel processing
            show_progress = True,  # Display a progress bar
            )
)
display(expanded_df)

	id	date	value	value_expanding_mean	value_expanding_std	value_expanding_quantile_75
0	D10	2014-07-03	2076.2	2076.200000	NaN	2076.200
1	D10	2014-07-04	2073.4	2074.800000	1.979899	2075.500
2	D10	2014-07-05	2048.7	2066.100000	15.133737	2074.800
3	D10	2014-07-06	2048.9	2061.800000	15.054789	2074.100
4	D10	2014-07-07	2006.4	2050.720000	27.996732	2073.400
...	...	...	...	...	...	...
9738	D500	2012-09-19	9418.8	8286.606679	2456.960489	9903.550
9739	D500	2012-09-20	9365.7	8286.864035	2456.723940	9902.600
9740	D500	2012-09-21	9445.9	8287.140391	2456.496163	9902.075
9741	D500	2012-09-22	9497.9	8287.429011	2456.274422	9901.550
9742	D500	2012-09-23	9545.3	8287.728789	2456.058410	9901.025

9743 rows × 6 columns

# Example 2 - Polars Backend for Expanding Window Functions using Built-Ins
#             (538X Faster than Pandas)
#  This example demonstrates the use of string-named functions and configurable
#  functions using the Polars backend for computations. Configurable functions,
#  like pl_quantile, allow the use of specific parameters associated with their
#  corresponding polars.Expr.rolling_<function_name> method.
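#  For instance, pl_quantile corresponds to polars.Expr.rolling_quantile.
#  Note: the quantile_75 values differ from Example 1 because the engines' default
#  quantile interpolation differs (polars uses 'nearest'; pandas uses 'linear').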

import pytimetk as tk
import pandas as pd
import polars as pl
import numpy as np
from pytimetk.utils.polars_helpers import pl_quantile
from pytimetk.utils.pandas_helpers import pd_quantile

df = tk.load_dataset("m4_daily", parse_dates = ['date'])

expanded_df = (
    df
        .groupby('id')
        .augment_expanding(
            date_column = 'date',
            value_column = 'value',
            window_func = [
                'mean',  # Built-in mean function
                'std',   # Built-in std function
                ('quantile_75', pl_quantile(quantile=0.75)),  # Configurable with all parameters found in polars.Expr.rolling_quantile
            ],
            min_periods = 1,
            engine = 'polars',  # Utilize Polars for the underlying computations
        )
)
display(expanded_df)

	id	date	value	value_expanding_mean	value_expanding_std	value_expanding_quantile_75
0	D10	2014-07-03	2076.2	2076.200000	NaN	2076.2
1	D10	2014-07-04	2073.4	2074.800000	1.979899	2076.2
2	D10	2014-07-05	2048.7	2066.100000	15.133737	2076.2
3	D10	2014-07-06	2048.9	2061.800000	15.054789	2076.2
4	D10	2014-07-07	2006.4	2050.720000	27.996732	2073.4
...	...	...	...	...	...	...
9738	D500	2012-09-19	9418.8	8286.606679	2456.960489	9906.4
9739	D500	2012-09-20	9365.7	8286.864035	2456.723940	9902.6
9740	D500	2012-09-21	9445.9	8287.140391	2456.496163	9902.6
9741	D500	2012-09-22	9497.9	8287.429011	2456.274422	9902.6
9742	D500	2012-09-23	9545.3	8287.728789	2456.058410	9902.6

9743 rows × 6 columns

# Example 3 - Lambda Functions for Expanding Window Functions are faster in Pandas than Polars
# This example demonstrates the use of custom lambda functions, e.g. lambda x: x.max() - x.min().
# Lambda functions, while convenient, have significantly slower performance than built-in functions.
# When using lambda functions the Pandas backend will likely be faster than Polars.

import pytimetk as tk
import pandas as pd
import numpy as np

df = tk.load_dataset("m4_daily", parse_dates = ['date'])

expanded_df = (
    df
        .groupby('id')
        .augment_expanding(
            date_column = 'date',
            value_column = 'value',
            window_func = [

                ('range', lambda x: x.max() - x.min()),  # Custom lambda function: can be slower, especially in Polars
            ],
            min_periods = 1,
            engine = 'pandas',  # Utilize pandas for the underlying computations
        )
)
display(expanded_df)

	id	date	value	value_expanding_range
0	D10	2014-07-03	2076.2	0.0
1	D10	2014-07-04	2073.4	2.8
2	D10	2014-07-05	2048.7	27.5
3	D10	2014-07-06	2048.9	27.5
4	D10	2014-07-07	2006.4	69.8
...	...	...	...	...
9738	D500	2012-09-19	9418.8	10782.0
9739	D500	2012-09-20	9365.7	10782.0
9740	D500	2012-09-21	9445.9	10782.0
9741	D500	2012-09-22	9497.9	10782.0
9742	D500	2012-09-23	9545.3	10782.0

9743 rows × 4 columns