# Source code for ppc_robot_lib.steps.transformations.aggregate_by_column

from ppc_robot_lib.steps.abstract_step import AbstractStep
from ppc_robot_lib.tasks import TaskContextInterface, StepPerformance



# [docs]
class AggregateByColumnStep(AbstractStep):
    """
    Group a table by specific columns and perform an aggregation on the specified column. The ``aggregation``
    argument must be the name of an aggregation method -- ``sum``, ``count``, ``mean`` etc. See
    `Pandas GroupBy Documentation <https://pandas.pydata.org/pandas-docs/stable/api.html#groupby>`_ for a complete list.

    Unlike :py:class:`ppc_robot_lib.steps.transformations.group_by_and_aggregate.GroupByAndAggregateStep`, this
    step stores the raw return value of the pandas aggregation method in the output table. Assuming the input table
    is a :py:class:`pandas.DataFrame`, reductions such as ``sum`` usually yield a :py:class:`pandas.Series` (when
    ``column`` is given) or a :py:class:`pandas.DataFrame` whose index holds the grouping keys, i.e. the grouping
    columns are not regular columns of the result. Be careful when using the result.

    See :ref:`pandas:groupby.split` for more information about working with
    :py:class:`pandas.core.groupby.DataFrameGroupBy` objects.

    **Example:**

    Let's assume the following table named ``input``:

    ======== ======== ===========
    Campaign Ad Group Impressions
    ======== ======== ===========
    Camp 1   A        10
    Camp 1   B        30
    Camp 2   A        15
    Camp 2   A        50
    Camp 2   A        0
    ======== ======== ===========

    By executing::

        >>> from ppc_robot_lib.steps.transformations import AggregateByColumnStep
        >>> AggregateByColumnStep("input", group_by="Campaign", aggregation="sum", column="Impressions",
        ...                       output_table="output")

    We would get the following result in the ``output`` table (``Campaign`` is the index of the result):

    ========== ===========
    Campaign   Impressions
    ========== ===========
    Camp 1     40
    Camp 2     65
    ========== ===========

    If you omit the column, the aggregation will be applied on all columns that are not present in the ``group_by``
    argument.

    """

    def __init__(
        self,
        input_table: str,
        group_by: str | list[str] | tuple[str, ...],
        aggregation: str,
        output_table: str,
        column: str | None = None,
        sort: bool = False,
    ):
        """
        :param input_table: Table to use for grouping and aggregation.
        :param group_by: Column name or list of columns to group by.
        :param aggregation: Aggregation to apply, must be a name of a method from
            :py:class:`pandas.core.groupby.DataFrameGroupBy`, e.g. ``sum``, ``count`` or ``mean``. Complete list
            can be found in the
            `Pandas GroupBy Documentation <https://pandas.pydata.org/pandas-docs/stable/api.html#groupby>`_.
        :param output_table: Output table.
        :param column: Column to use for aggregation. When ``None`` (the default), the aggregation is applied
            to all columns that are not used for grouping.
        :param sort: Set to ``True`` if you would like to sort the result by the columns used for grouping.
        """
        self.input_table = input_table
        self.group_by = group_by
        self.aggregation = aggregation
        self.output_table = output_table
        self.column = column
        self.sort = sort

    def execute(self, task_ctx: TaskContextInterface) -> StepPerformance:
        """
        Group the input table, apply the aggregation and store the result under ``output_table``.

        :param task_ctx: Task context whose work set provides the input table and receives the result.
        :return: Performance record built from the input table, its row count and the length of the result.
        :raises AttributeError: If the grouped object has no attribute named ``aggregation``.
        """
        table = task_ctx.work_set.get_table(self.input_table)
        grouped = table.groupby(by=self.group_by, sort=self.sort)
        # Compare against None explicitly: a falsy but valid column label (e.g. an empty string) must still
        # be selected instead of silently aggregating every non-grouping column.
        if self.column is not None:
            grouped = grouped[self.column]
        # Look the aggregation up by name on the grouped object and invoke it without arguments.
        aggregation = getattr(grouped, self.aggregation)
        aggregated = aggregation()
        task_ctx.work_set.set_table(self.output_table, aggregated)

        row_count = len(table.index)
        return StepPerformance(table, rows_in=row_count, rows_out=len(aggregated))