Source code for pyterrier_services.semantic_scholar

import os
from typing import List, Optional, Union, Tuple
from functools import partial
import pandas as pd
import requests
import pyterrier as pt
from . import http_error_retry, paginated_search, multi_query


[docs]
class SemanticScholarApi:
    """Handle on the Semantic Scholar search API, holding the (optional) API key used for requests."""
    API_BASE_URL = 'https://api.semanticscholar.org/graph/v1'

    def __init__(self, api_key: Optional[str] = None):
        """
        Args:
            api_key: The Semantic Scholar API key. When omitted (or empty), the value of the ``S2_API_KEY``
                environment variable is used instead; when that is also unset, ``api_key`` is ``None`` and
                the API is used without authentication.
        """
        if not api_key:
            # No usable key was passed in: consult the environment (yields None when the variable is unset).
            api_key = os.environ.get('S2_API_KEY')
        self.api_key = api_key


[docs]
    def retriever(self,
        *,
        num_results: int = 100,
        fields: List[str] = ['title', 'abstract'],
        verbose: bool = True
    ) -> pt.Transformer:
        """Builds a :class:`~pyterrier.Transformer` that retrieves articles from Semantic Scholar through this API object.

        Args:
            num_results: How many results the retriever should fetch. Defaults to 100.
            fields: Names of the fields to request for each retrieved result. Defaults to ['title', 'abstract'].
            verbose: Whether the retriever logs its progress. Defaults to True.

        Returns:
            A :class:`SemanticScholarRetriever` constructed with ``api=self`` and the given settings.
        """
        # All settings are forwarded unchanged; the retriever is bound to this API instance.
        retriever_kwargs = {
            'api': self,
            'num_results': num_results,
            'fields': fields,
            'verbose': verbose,
        }
        return SemanticScholarRetriever(**retriever_kwargs)



[docs]
    def search(self,
        query: str,
        *,
        offset: int = 0,
        limit: int = 100,
        fields: List[str] = ['title', 'abstract'],
        return_next: bool = False,
        return_total: bool = False
    ) -> Union[pd.DataFrame, Tuple[pd.DataFrame, Optional[int]], Tuple[pd.DataFrame, Optional[int], int]]:
        """Searches for papers on Semantic Scholar with the provided query.

        Issues a single GET request to the ``/paper/search`` endpoint and converts the response into a
        result frame with a ``docno`` column (renamed from ``paperId``), the returned fields, and
        ``rank``/``score`` columns. ``rank`` counts up from the response offset and ``score`` is ``-rank``.

        Args:
            query: The search query.
            offset: The offset of the first result to retrieve. Defaults to 0.
            limit: The maximum number of results to retrieve. The value sent to the API is clamped to the
                range 1..100. Defaults to 100.
            fields: The fields to include in the retrieved results. Defaults to ['title', 'abstract'].
            return_next: Whether to also return the ``next`` value of the response (``None`` when the
                response carries no ``next`` key). Defaults to False.
            return_total: Whether to also return the ``total`` value of the response. Defaults to False.

        Returns:
            The result frame alone when neither ``return_next`` nor ``return_total`` is set; otherwise a
            tuple of the frame followed by the requested extras, in the order (next, total).

        Raises:
            requests.HTTPError: If the API responds with an error status.
        """
        params = {
            'query': query,
            'offset': offset,
            'fields': ','.join(fields),
            'limit': max(min(limit, 100), 1),
        }
        # Only send the key header when a key is configured; otherwise the request is unauthenticated.
        headers = {'x-api-key': self.api_key} if self.api_key else {}
        response = requests.get(SemanticScholarApi.API_BASE_URL + '/paper/search', params=params, headers=headers)
        response.raise_for_status()
        payload = response.json()

        # Treat a missing or null ``data`` entry the same as an empty result list instead of raising KeyError.
        records = payload.get('data') or []
        if not records:
            result_df = pd.DataFrame(columns=['docno', *[str(f) for f in fields], 'rank', 'score'])
            # Keep rank/score integer-typed so an empty page has the same dtypes as a populated one.
            result_df = result_df.astype({'rank': 'int64', 'score': 'int64'})
        else:
            # Prefer the offset echoed by the API; fall back to the one that was requested.
            first_rank = payload.get('offset', offset)
            result_df = pd.DataFrame(records)
            result_df.rename(columns={'paperId': 'docno'}, inplace=True)
            result_df['rank'] = range(first_rank, first_rank + len(result_df))
            result_df['score'] = -result_df['rank']

        extras = []
        if return_next:
            extras.append(payload.get('next'))
        if return_total:
            extras.append(payload['total'])
        if not extras:
            return result_df
        return (result_df, *extras)





[docs]
class SemanticScholarRetriever(pt.Transformer):
    """A :class:`~pyterrier.Transformer` retriever that queries the Semantic Scholar search API."""
    def __init__(self,
        *,
        api: Optional[SemanticScholarApi] = None,
        num_results: int = 100,
        fields: List[str] = ['title', 'abstract'],
        verbose: bool = True
    ):
        """
        Args:
            api: The Semantic Scholar api service. Defaults to a new instance of :class:`~pyterrier_services.SemanticScholarApi`.
            num_results: The number of results to retrieve per query. Defaults to 100.
            fields: The fields to include in the retrieved results. Defaults to ['title', 'abstract'].
            verbose: Whether to log the progress. Defaults to True.
        """
        self.api = api or SemanticScholarApi()
        self.num_results = num_results
        # Store a copy: the default list object is shared by every call that omits ``fields``, so keeping a
        # reference would let an in-place change on one retriever leak into all later ones (and into the
        # caller's own list when one is passed).
        self.fields = list(fields)
        self.verbose = verbose

    def transform(self, inp: pd.DataFrame) -> pd.DataFrame:
        """Runs the Semantic Scholar search pipeline over ``inp`` and returns the resulting frame.

        ``api.search`` (with ``fields`` pre-bound) is wrapped, innermost first, by ``http_error_retry``,
        ``paginated_search`` (given ``num_results``) and ``multi_query`` (given the verbosity settings);
        the composed callable is then applied to ``inp``.
        NOTE(review): the exact retry / paging / per-query behaviour lives in those package helpers,
        not in this file; confirm there.
        """
        search_fn = partial(self.api.search, fields=self.fields)
        paged_search = paginated_search(
            http_error_retry(search_fn),
            num_results=self.num_results,
        )
        run_queries = multi_query(
            paged_search,
            verbose=self.verbose,
            verbose_desc='SemanticScholarRetriever',
        )
        return run_queries(inp)

    def fuse_rank_cutoff(self, k: int) -> Optional['SemanticScholarRetriever']:
        """Returns a retriever limited to ``k`` results when that is fewer than ``num_results``.

        Args:
            k: The requested rank cutoff.

        Returns:
            A new :class:`SemanticScholarRetriever` with the same api, fields and verbosity but
            ``num_results=k`` when ``k < num_results``; otherwise ``None``.
        """
        if k < self.num_results:
            return SemanticScholarRetriever(api=self.api, num_results=k, fields=self.fields, verbose=self.verbose)
        # The cutoff would not reduce the number of results fetched, so there is nothing to fuse.
        return None