Source code for src.data._pull_data

"""
Downloads and stores tweets as .json files in their raw format.

The data is stored "as is". The remaining ETL steps can be found
in separate scripts.
"""

import logging
import json
from searchtweets import (
    load_credentials,
    gen_rule_payload,
    collect_results,
    ResultStream
)
import os

# Module-wide logging setup, executed once at import time: every record is
# rendered as "<timestamp> - <logger name> - <level> - <message>" and the
# root logger is set to INFO.
log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=log_fmt, level=logging.INFO)


def pull_tweets(
    query,
    from_date,
    to_date,
    save_path,
    credentials_path,
    yaml_key,
    file_name=None,
    results_per_call=500,
    max_results=3000,
    verbose=False,
    **kwargs
):
    """
    Pulls data (i.e., tweets and user info) from Twitter using its API.

    The data received from the API is stored in its original form (JSON)
    without performing any type of preprocessing. Each tweet is written as
    one JSON document per line. The output file is opened in append mode,
    so an existing file with the same name is extended, not overwritten.

    Parameters
    ----------
    query : str
        Query passed to the Twitter API to fetch Tweets.
    from_date : str or None
        Date format as specified by `convert_utc_time` for the starting
        time of your search.
    to_date : str or None
        Date format as specified by `convert_utc_time` for the end time
        of your search.
    save_path : str
        Path where the raw data will be stored. The directory is created
        if it does not exist yet.
    credentials_path : str
        Path for the yaml file with the Twitter API credentials.
    yaml_key : str
        Key within the yaml file containing the Twitter API credentials
        to be used.
    file_name : str or None, default=None
        Name of the json file saved containing the data dump. If None,
        the name will be assigned as a function of `query`, `from_date`
        and `to_date`.
    results_per_call : int, default=500
        Number of Tweets returned per call.
    max_results : int, default=3000
        Maximum number of Tweets to be pulled.
    verbose : int or bool, default=False
        Controls the verbosity when pulling data.
    **kwargs
        Accepted for call-site compatibility; not used by this function.

    Returns
    -------
    None : NoneType
    """
    logger = logging.getLogger(__name__)
    # Verbosity is implemented by switching propagation to ancestor loggers
    # on or off: with propagation disabled the records below never reach
    # the handlers installed on the root logger.
    logger.propagate = verbose
    logger.info('Pulling raw Twitter data')

    search_args = load_credentials(
        filename=credentials_path,
        yaml_key=yaml_key
    )
    rule = gen_rule_payload(
        query,
        results_per_call=results_per_call,
        from_date=from_date,
        to_date=to_date
    )
    rs = ResultStream(
        rule_payload=rule,
        max_results=max_results,
        **search_args
    )

    if file_name is None:
        file_name = f'SAMPLE_DATA_QUERY_{query}_'\
            + f'FROMDATE_{from_date}_TODATE_{to_date}.json'

    # Create the target directory up front so a missing folder does not
    # abort the run. An empty `save_path` means "current directory" and
    # needs no creation (os.makedirs('') would raise).
    if save_path:
        os.makedirs(save_path, exist_ok=True)
    output_path = os.path.join(save_path, file_name)

    with open(output_path, 'a', encoding='utf-8') as f:
        for tweet in rs.stream():
            json.dump(tweet, f)
            f.write('\n')

    # The original message glued "at" directly onto the quoted path and
    # misspelled "successfully"; both are fixed here.
    logger.info('Data successfully saved at "%s"', output_path)
    return None
def count_tweets(
    query,
    from_date,
    to_date,
    credentials_path,
    yaml_key,
    count_bucket="day",
    results_per_call=500,
    verbose=False,
    **kwargs
):
    """
    Returns the number of existing Tweets for a given query and time frame.

    No tweets are written anywhere by this function; it only builds a
    counts rule and collects the results. It is intended as a way to
    check the effectiveness of your filters before pulling the actual
    data.

    Parameters
    ----------
    query : str
        Query passed to the Twitter API to fetch Tweets.
    from_date : str or None
        Date format as specified by `convert_utc_time` for the starting
        time of your search.
    to_date : str or None
        Date format as specified by `convert_utc_time` for the end time
        of your search.
    credentials_path : str
        Path for the yaml file with the Twitter API credentials.
    yaml_key : str
        Key within the yaml file containing the Twitter API credentials
        to be used.
    count_bucket : str or None, default="day"
        If using the counts api endpoint, will define the count bucket
        for which tweets are aggregated.
    results_per_call : int, default=500
        Number of Tweets returned per call.
    verbose : int or bool, default=False
        Controls the verbosity when pulling the tweet count.
    **kwargs
        Accepted for call-site compatibility; not used by this function.

    Returns
    -------
    counts
        Whatever `collect_results` returns for the counts rule —
        presumably one entry per bucket; confirm the exact container
        type against the searchtweets documentation.
    """
    module_logger = logging.getLogger(__name__)
    # Propagation to ancestor loggers doubles as the verbosity switch.
    module_logger.propagate = verbose
    module_logger.info('Counting Tweets')

    credentials = load_credentials(credentials_path, yaml_key=yaml_key)

    # Gather the rule options in one place before building the payload.
    rule_options = {
        'from_date': from_date,
        'to_date': to_date,
        'count_bucket': count_bucket,
        'results_per_call': results_per_call,
    }
    counts_payload = gen_rule_payload(query, **rule_options)

    return collect_results(counts_payload, result_stream_args=credentials)