Source code for ppc_robot_lib.steps.transformations.aggregate_by_column

from ppc_robot_lib.steps.abstract_step import AbstractStep
from ppc_robot_lib.tasks import TaskContextInterface, StepPerformance


class AggregateByColumnStep(AbstractStep):
    """
    Group a table by specific columns and apply one aggregation to the groups.

    The ``aggregation`` argument must be the name of a method of the pandas
    group-by object -- ``sum``, ``count``, ``mean`` etc. See
    `Pandas GroupBy Documentation <https://pandas.pydata.org/pandas-docs/stable/api.html#groupby>`_
    for a complete list.

    Unlike :py:class:`ppc_robot_lib.steps.transformations.group_by_and_aggregate.GroupByAndAggregateStep`,
    this step stores the raw return value of that group-by method as the output
    table. For reducing aggregations such as ``sum`` pandas typically returns
    a :py:class:`pandas.Series` (when ``column`` is given) or
    a :py:class:`pandas.DataFrame` (when it is omitted) that is indexed by the
    grouping keys, so be careful when using the result. See
    :ref:`pandas:groupby.split` for more information about working with
    :py:class:`pandas.core.groupby.DataFrameGroupBy` objects.

    **Example:**

    Let's assume the following table named ``input``:

    ======== ======== ===========
    Campaign Ad Group Impressions
    ======== ======== ===========
    Camp 1   A        10
    Camp 1   B        30
    Camp 2   A        15
    Camp 2   A        50
    Camp 2   A        0
    ======== ======== ===========

    By executing::

        >>> from ppc_robot_lib.steps.transformations import AggregateByColumnStep
        >>> AggregateByColumnStep("input", group_by="Campaign", aggregation="sum", column="Impressions",
        ...                       output_table="output")

    We would get the following result, indexed by ``Campaign``:

    ========== ===========
    Campaign   Impressions
    ========== ===========
    Camp 1     40
    Camp 2     65
    ========== ===========

    If you omit the column, the aggregation will be applied on all columns
    that are not present in the ``group_by`` argument.
    """

    def __init__(
        self,
        input_table: str,
        group_by: str | list[str] | tuple[str, ...],
        aggregation: str,
        output_table: str,
        column: str | None = None,
        sort: bool = False,
    ) -> None:
        """
        :param input_table: Table to use for grouping and aggregation.
        :param group_by: Column name or list of columns to group by.
        :param aggregation: Aggregation to apply, must be a name of a method from
            :py:class:`pandas.core.groupby.DataFrameGroupBy`, e.g. ``sum``, ``count`` or ``mean``.
            Complete list can be found in the
            `Pandas GroupBy Documentation <https://pandas.pydata.org/pandas-docs/stable/api.html#groupby>`_.
        :param output_table: Output table.
        :param column: Column to use for aggregation. When omitted, all columns
            that are not used for grouping are aggregated.
        :param sort: Set to ``True`` if you would like to sort the result by the
            columns used for grouping.
        """
        self.input_table = input_table
        self.group_by = group_by
        self.aggregation = aggregation
        self.output_table = output_table
        self.column = column
        self.sort = sort

    def execute(self, task_ctx: TaskContextInterface) -> StepPerformance:
        """
        Group the input table, run the aggregation and store its result.

        :param task_ctx: Task context whose ``work_set`` supplies the input table
            and receives the output table.
        :return: Performance record with the input row count and the length of
            the aggregated result.
        :raises AttributeError: If ``aggregation`` is not an attribute of the
            group-by object.
        """
        table = task_ctx.work_set.get_table(self.input_table)
        grouped = table.groupby(by=self.group_by, sort=self.sort)

        # A falsy column (``None`` or an empty string) means "aggregate every
        # column that is not part of the grouping key".
        if self.column:
            grouped = grouped[self.column]

        # The aggregation is resolved by name and called without arguments.
        aggregation = getattr(grouped, self.aggregation)
        aggregated = aggregation()

        task_ctx.work_set.set_table(self.output_table, aggregated)

        row_count = len(table.index)
        return StepPerformance(table, rows_in=row_count, rows_out=len(aggregated))