Source code for pyterrier.debug

import pandas as pd
from . import Transformer
from typing import List, Optional


[docs]
def print_columns(by_query : bool = False, message : Optional[str] = None) -> Transformer:
    """
    Returns a transformer that can be inserted into pipelines that can print the column names of the dataframe
    at this stage in the pipeline:

    :param by_query: whether to display for each query. Defaults to False.
    :param message: whether to display a message before printing. Defaults to None, which means no message. This
       is useful when ``print_columns()`` is being used multiple times within a pipeline 
     

    Example::
    
        pipe = (
            bm25
            >> pt.debug.print_columns() 
            >> pt.rewrite.RM3() 
            >> pt.debug.print_columns()
            bm25)

    When the above pipeline is executed, two sets of columns will be displayed
     - `["qid", "query", "docno", "rank", "score"]`  - the output of BM25, a ranking of documents
     - `["qid", "query", "query_0"]`   - the output of RM3, a reformulated query
    
        
    """
    import pyterrier as pt
    def _do_print(df):
        if message is not None:
            print(message)
        print(df.columns)
        return df
    return pt.apply.by_query(_do_print) if by_query else pt.apply.generic(_do_print) 



[docs]
def print_num_rows(
        by_query : bool = True, 
        msg : str = "num_rows") -> Transformer:
    """
    Returns a transformer that can be inserted into pipelines that can print the number of rows names of the dataframe
    at this stage in the pipeline:

    :param by_query: whether to display for each query. Defaults to True.
    :param message: whether to display a message before printing. Defaults to "num_rows". This
       is useful when ``print_columns()`` is being used multiple times within a pipeline 
     
    Example::
    
        pipe = (
            bm25
            >> pt.debug.print_num_rows() 
            >> pt.rewrite.RM3() 
            >> pt.debug.print_num_rows()
            bm25)

    When the above pipeline is executed, the following output will be displayed
     - `num_rows 1: 1000` - the output of BM25, a ranking of documents
     - `num_rows 1: 1` - the output of RM3, the reformulated query
    
        
    """

    import pyterrier as pt
    def _print_qid(df):
        qid = df.iloc[0].qid
        print("%s %s: %d" % (msg, qid, len(df)))
        return df

    def _print(df):
        print("%s: %d" % (msg, len(df)))
        return df

    if by_query:
        return pt.apply.by_query(_print_qid, add_ranks=False)
    else:
        return pt.apply.generic(_print, add_ranks=False)



[docs]
def print_rows(
        by_query : bool = True, 
        jupyter: bool = True, 
        head : int = 2, 
        message : Optional[str] = None, 
        columns : Optional[List[str]] = None) -> Transformer:
    """
    Returns a transformer that can be inserted into pipelines that can print some of the dataframe
    at this stage in the pipeline:

    :param by_query: whether to display for each query. Defaults to True.
    :param jupyter: Whether to use IPython's display function to display the dataframe. Defaults to True.
    :param head: The number of rows to display. None means all rows.
    :param columns: Limit the columns for which data is displayed. Default of None displays all columns.
    :param message: whether to display a message before printing. Defaults to None, which means no message. This
       is useful when ``print_rows()`` is being used multiple times within a pipeline 

    Example::

        pipe = (
            bm25
            >> pt.debug.print_rows() 
            >> pt.rewrite.RM3() 
            >> pt.debug.print_rows()
            bm25)
     
    """
    import pyterrier as pt
    def _do_print(df):
        if message is not None:
            print(message)
        render = df if head is None else df.head(head)
        if columns is not None:
            render = render[columns]
        if jupyter:
            from IPython.display import display # type: ignore
            display(render)
        else:
            print(render)
        return df
    return pt.apply.by_query(_do_print) if by_query else pt.apply.generic(_do_print) 



[docs]
class pdb(Transformer):
    """Returns a transformer that starts an interactive `pdb <https://docs.python.org/3/library/pdb.html>`__
    debugger session. The interactive session can be used to inspect the dataframe at this stage in the pipeline.

    Example::

        pipe = (
            bm25
            >> pt.debug.pdb()
            >> pt.rewrite.RM3() 
            >> pt.debug.pdb()
            bm25)
    """
    def transform(self, inp: pd.DataFrame) -> pd.DataFrame:
        import pdb
        pdb.set_trace()
        return inp