# Source code for pyterrier_alpha.dataframe_builder

"""Utility to build a DataFrame from a sequence of dictionaries."""

from itertools import chain
from typing import Any, Dict, List, Optional

import numpy as np
import pandas as pd



# [docs]
class DataFrameBuilder:
    """Utility to build a DataFrame from a sequence of dictionaries.

    .. versionadded:: 0.1.0

    The dictionaries must have the same keys, and the values must be either scalars, or lists of the same length.
    """

    def __init__(self, columns: List[str]):
        """Create a DataFrameBuilder with the given columns.

        Args:
            columns: the columns of the resulting DataFrame, required to be present in each
                call to :meth:`~pyterrier_alpha.DataFrameBuilder.extend`.
        """
        if '_index' not in columns:
            columns = ['_index'] + columns
        # Each column maps to a list of chunks; every call to ``extend`` appends one chunk per column.
        self._data = {c: [] for c in columns}
        # Value used for ``_index`` by the next ``extend`` call that does not supply one.
        self._auto_index = 0

    def extend(self, values: Dict[str, Any]) -> None:
        """Add a dictionary of values to the DataFrameBuilder.

        The ``values`` dictionary itself is never modified, and nothing is appended unless
        the whole call is valid.

        .. versionchanged:: 0.4.1
            Allow all fields to be scalars (assumes length of 1).

        .. versionchanged:: 0.7.0
            Automatically infer the ``_index`` field.

        .. versionchanged:: 0.9.2
            Allow broadcasting of input lists with the length of 1. This allows support for inputs like arrays, which
            are not meant to be treated as lists themselves.

        Args:
            values: a dictionary of values to add to the DataFrameBuilder. The keys must be the same as the columns
                provided to the constructor, and the values must be either scalars, or lists (all of the same length).

        Raises:
            AssertionError: if a column is missing, or if the list-like values differ in length.
            KeyError: if ``values`` contains a key that is not one of the builder's columns.
        """
        # Work on a shallow copy: adding ``_index`` to the caller's dict would make a reused
        # dict carry a stale ``_index`` into every later call.
        row = dict(values)
        if '_index' not in row:
            row['_index'] = self._auto_index
            self._auto_index += 1

        # Raised explicitly (rather than via ``assert``) so the checks survive ``python -O``;
        # the exception type and message are the same as an ``assert`` would produce.
        if any(c not in row for c in self._data):
            raise AssertionError(f"all columns must be provided: {list(self._data)}")

        # Only sized, non-string values longer than 1 determine the number of rows being added.
        lens = {
            k: len(v)
            for k, v in row.items()
            if hasattr(v, '__len__') and not isinstance(v, str) and len(v) > 1
        }
        # If nothing has a length above 1, everything is given a length of 1.
        first_len = next(iter(lens.values()), 1)
        if any(n != first_len for n in lens.values()):
            raise AssertionError(f"all values must have the same length {lens}")

        # Resolve every target column and chunk before appending anything, so an unknown key
        # raises KeyError without leaving some columns one chunk longer than others.
        pending = []
        for k, v in row.items():
            column = self._data[k]
            if k not in lens:
                if isinstance(v, (tuple, list)) and len(v) == 1:
                    # A one-element list/tuple is broadcast by repetition.
                    chunk = v * first_len
                else:
                    # Anything else is treated as a single value and repeated.
                    chunk = [v] * first_len
            elif isinstance(v, pd.Series):
                chunk = v.values
            else:
                chunk = v
            pending.append((column, chunk))
        for column, chunk in pending:
            column.append(chunk)

    @staticmethod
    def _concat_chunks(chunks: List[Any]) -> Any:
        """Flatten the chunks collected for one column into a single 1-dimensional column."""
        # ``np.concatenate`` would stack array/list/tuple elements into a 2-dimensional array;
        # chaining keeps each such element as one cell. Only the first element is inspected.
        if len(chunks) > 0 and not isinstance(chunks[0][0], (np.ndarray, list, tuple)):
            return np.concatenate(chunks)
        return list(chain.from_iterable(chunks))

    def to_df(self, merge_on_index: Optional[pd.DataFrame] = None) -> pd.DataFrame:
        """Convert the DataFrameBuilder to a DataFrame.

        .. versionchanged:: 0.1.1
            Added ``merge_on_index`` argument.

        .. versionchanged:: 0.1.1
            Columns from ``merge_on_index`` come first.

        .. versionchanged:: 0.9.3
            Fixed bug with columns that have values of numpy arrays

        Args:
            merge_on_index: an optional DataFrame to merge the resulting DataFrame on. Its rows are
                looked up positionally using the ``_index`` value of each built row.

        Returns:
            A DataFrame with the values added to the DataFrameBuilder. The ``_index`` column is dropped.
        """
        result = pd.DataFrame({k: self._concat_chunks(v) for k, v in self._data.items()})
        if merge_on_index is not None:
            merge_on_index = merge_on_index.reset_index(drop=True)
            positions = result['_index'].values
            if len(positions) == 0:
                # With no rows the ``_index`` column has no integer dtype, which positional
                # indexing may reject; use an explicit empty integer indexer instead.
                positions = np.empty(0, dtype=np.intp)
            # Columns already produced by the builder take precedence over the merged frame's.
            result = result.assign(**{
                col: merge_on_index[col].iloc[positions].values
                for col in merge_on_index.columns
                if col not in result.columns
            })
            merge_columns = set(merge_on_index.columns)
            column_order = list(merge_on_index.columns) + [c for c in result.columns if c not in merge_columns]
            result = result[column_order]
        result = result.drop(columns=['_index'])
        return result