# Source code for src.data._utils

import logging

import pandas as pd


def query_es(client, body=None, index_query='*', scroll='2s'):
    """
    Queries an Elastic Search database to get all the results of a query.

    Every index (or alias) matching ``index_query`` is searched separately
    and paged through with the Scroll API until an empty page is returned.
    The scroll context opened for each index is released once that index
    has been read, including when a request fails part-way through.

    Parameters
    ----------
    client : elasticsearch.Elasticsearch
        Client exposing ``indices.get_alias``, ``search``, ``scroll`` and
        ``clear_scroll``.
    body : dict or NoneType, default=None
        The search definition using the Query DSL.
    index_query : str, default='*'
        Search over indices matching passed index query.
    scroll : str, default='2s'
        Length of time to keep search context between two consecutive
        requests.

    Returns
    -------
    df : DataFrame
        Pandas DataFrame with one row per hit, built from each hit's
        ``_source``. Rows follow the iteration order of the matched
        indices; the frame is empty when nothing matches.
    """
    data = []

    # Passed by keyword: newer client versions no longer accept
    # positional API arguments, older ones accept both forms.
    indices = client.indices.get_alias(index=index_query)
    for index in indices:
        data.extend(_scroll_index(client, index, body, scroll))

    return pd.DataFrame([hit['_source'] for hit in data])


def _scroll_index(client, index, body, scroll):
    """Return every hit for ``body`` in ``index``, then free the scroll context."""
    hits = []
    scroll_id = None
    try:
        # The initial search opens the scroll context and yields page one.
        resp = client.search(
            index=index,
            body=body,
            scroll=scroll  # length of time to keep search context
        )
        scroll_id = resp['_scroll_id']
        page = resp['hits']['hits']

        # An empty page marks the end of the result set.
        while page:
            hits.extend(page)
            resp = client.scroll(scroll_id=scroll_id, scroll=scroll)
            # The id may change between pages; always use the latest one.
            scroll_id = resp['_scroll_id']
            page = resp['hits']['hits']
    finally:
        if scroll_id is not None:
            try:
                client.clear_scroll(scroll_id=scroll_id)
            except Exception:
                # Best-effort cleanup: the context may already have
                # expired, and a failure here must not hide the query
                # result or the exception that is propagating.
                logging.getLogger(__name__).debug(
                    "Could not clear scroll context %s",
                    scroll_id,
                    exc_info=True,
                )
    return hits