from ppc_robot_lib.steps.abstract_step import AbstractStep
from ppc_robot_lib.tasks import TaskContextInterface, StepPerformance
[docs]
class AggregateByColumnStep(AbstractStep):
    """
    Group the table by specific columns and perform an aggregation on the specified column. The ``aggregation``
    argument must be a name of the aggregation method -- ``sum``, ``count``, ``mean`` etc. See
    `Pandas GroupBy Documentation <https://pandas.pydata.org/pandas-docs/stable/api.html#groupby>`_ for a complete
    list.

    Compared to :py:class:`ppc_robot_lib.steps.transformations.group_by_and_aggregate.GroupByAndAggregateStep`,
    this step stores the raw value returned by the pandas aggregation method. For reductions such as ``sum`` this
    is typically an object indexed by the grouping columns (they are not regular columns of the result), and
    a :py:class:`pandas.Series` instead of a :py:class:`pandas.DataFrame` when a single ``column`` is given.
    Be careful when using the result.

    See :ref:`pandas:groupby.split` for more information about working with
    :py:class:`pandas.core.groupby.DataFrameGroupBy` objects.

    **Example:**

    Let's assume the following table named ``input``:

    ======== ======== ===========
    Campaign Ad Group Impressions
    ======== ======== ===========
    Camp 1   A        10
    Camp 1   B        30
    Camp 2   A        15
    Camp 2   A        50
    Camp 2   A        0
    ======== ======== ===========

    By executing::

        >>> from ppc_robot_lib.steps.transformations import AggregateByColumnStep
        >>> AggregateByColumnStep("input", group_by="Campaign", aggregation="sum", column="Impressions",
        ...                       output_table="output")

    We would get the following result in the ``output`` table (``Campaign`` being the index):

    ========== ===========
    Campaign   Impressions
    ========== ===========
    Camp 1     40
    Camp 2     65
    ========== ===========

    If you omit the column, the aggregation will be applied on all columns that are not present in the ``group_by``
    argument.
    """

    def __init__(
        self,
        input_table: str,
        group_by: str | list[str] | tuple[str, ...],
        aggregation: str,
        output_table: str,
        column: str | None = None,
        sort: bool = False,
    ):
        """
        :param input_table: Table to use for grouping and aggregation.
        :param group_by: Column name or list (or tuple) of column names to group by.
        :param aggregation: Aggregation to apply, must be a name of a method from
            :py:class:`pandas.core.groupby.DataFrameGroupBy`, e.g. ``sum``, ``count`` or ``mean``. Complete list
            can be found in the
            `Pandas GroupBy Documentation <https://pandas.pydata.org/pandas-docs/stable/api.html#groupby>`_.
        :param output_table: Output table.
        :param column: Column to use for aggregation. When omitted (or empty), the aggregation is applied
            to all columns that are not used for grouping.
        :param sort: Set to ``True`` if you would like to sort the result by the columns used for grouping.
        """
        self.input_table = input_table
        self.group_by = group_by
        self.aggregation = aggregation
        self.output_table = output_table
        self.column = column
        self.sort = sort

    def execute(self, task_ctx: TaskContextInterface) -> StepPerformance:
        """
        Group the input table, apply the configured aggregation and store the result as the output table.

        :param task_ctx: Task context; its ``work_set`` is used to read the input table and to store the result.
        :return: Performance record with the row count of the input table and the length of the result.
        :raises AttributeError: If ``aggregation`` is not an attribute of the pandas group-by object.
        """
        table = task_ctx.work_set.get_table(self.input_table)

        # pandas interprets a tuple passed as ``by`` as ONE key (a single label), not as a list of keys.
        # The signature accepts a tuple of column names, so convert it to a list to group by each of them.
        group_keys = list(self.group_by) if isinstance(self.group_by, tuple) else self.group_by
        grouped = table.groupby(by=group_keys, sort=self.sort)

        # A falsy column (``None`` or an empty string) means "aggregate all non-grouping columns".
        if self.column:
            grouped = grouped[self.column]

        # The aggregation is looked up by name on the group-by object and called without arguments.
        aggregation = getattr(grouped, self.aggregation)
        aggregated = aggregation()

        task_ctx.work_set.set_table(self.output_table, aggregated)

        row_count = len(table.index)
        # NOTE(review): the *input* table is passed as the first argument, not the aggregated result --
        # confirm against StepPerformance that this is intended.
        return StepPerformance(table, rows_in=row_count, rows_out=len(aggregated))