Source code for src.features._preprocess

"""
Preprocessing functions to convert merged dataframe full text into text for NLP
use
"""

import nltk
nltk.download('vader_lexicon')
import string
from googletrans import Translator
from nltk.corpus import stopwords
from preprocessor import api
from nltk.sentiment.vader import SentimentIntensityAnalyzer


[docs]def translate_tweet(text, lang):
    """
    Translate a block of text (this function can be time consuming).

    Parameters
    ----------
    text : str
        Text to be translated.
    lang : str
        Langauge of original text


    Returns
    -------
    trans : str
        Text translated to English.
    """
    trans = Translator()
    return trans.translate(text).text


[docs]def translate_func(x, text, lang):
    """
    Function to use the .apply method on all rows of a dataframe to translate
    text. Performs better then itterrow.

    Parameters
    ----------
    x : Series or dict
        Pandas DataFrame row.
    text : str
        Name of the key containing the text to be translated.
    lang : str
        Name of the key containing the language of the text.

    Returns
    -------
    process : str
        Text translated into English.
    """
    if x[lang] != 'en':
        process = translate_tweet(x[text], x[lang])
    else:
        process = x[text]
    return process


[docs]def preprocessDataFrame(df):
    """
    Function to run the preprocessing pipeline on all tweets to generate
    the feature "full_text_processed": Translating tweets to English, removing
    stopwords & lemmatization, removing URLs and reserved words, lowercasing &
    punctuation removal and VADER sentiment analysis.

    Parameters
    ----------
    df : DataFrame
        Transformed DataFrame with original tweets

    Returns:
    df : DataFrame
        DataFrame with processed tweets
    """
    df['full_text_processed'] = df.apply(
        lambda x: translate_func(x, 'full_text', 'lang'),
        axis=1
    )

    # for some reason some rows are type float so make sure nothing will crash
    df['full_text_processed'] = df['full_text_processed'].astype(str)

    api.set_options('urls', 'reserved_words')
    df['full_text_processed'] = df['full_text_processed'].apply(
        lambda x: api.clean(x)
    )
    df['full_text_processed'] = df['full_text_processed'].apply(
        lambda x: x.lower()
    )

    def remove_punct(text):
        table = str.maketrans('', '', string.punctuation)
        return text.translate(table)

    df['full_text_processed'] = df['full_text_processed'].apply(
        lambda x: remove_punct(x)
    )

    lemmatizer = nltk.stem.WordNetLemmatizer()
    df['full_text_processed'] = df['full_text_processed'].apply(
        lambda x: ' '.join([lemmatizer.lemmatize(w) for w in x.split()])
    )

    stop_words = set(stopwords.words('english'))
    df['full_text_processed'] = df['full_text_processed'].apply(
        lambda x: ' '.join(
            [word for word in x.split() if word not in stop_words]
        )
    )

    sid = SentimentIntensityAnalyzer()

    def create_sentiment(x, text):
        return sid.polarity_scores(text)['compound']
    # add sentiment as part of preprocessing
    df['sentiment'] = df.apply(
        lambda x: create_sentiment(x, x['full_text_processed']), axis=1
    )

    return df