# src/features/_preprocess.py
"""
Preprocessing functions to convert merged dataframe full text into text for NLP
use
"""
import nltk
nltk.download('vader_lexicon')
import string
from googletrans import Translator
from nltk.corpus import stopwords
from preprocessor import api
from nltk.sentiment.vader import SentimentIntensityAnalyzer
def translate_func(x, text, lang):
    """
    Translate one DataFrame row's text to English, for use with ``.apply``.

    Designed for ``df.apply(..., axis=1)``, which performs better than
    ``iterrows``. Rows whose language is already English are returned
    unchanged.

    Parameters
    ----------
    x : Series or dict
        Pandas DataFrame row.
    text : str
        Name of the key containing the text to be translated.
    lang : str
        Name of the key containing the language code of the text.

    Returns
    -------
    process : str
        Text translated into English (or the original text when the row is
        already English).
    """
    # Only invoke the (slow, network-bound) translator for non-English rows.
    if x[lang] != 'en':
        return translate_tweet(x[text], x[lang])
    return x[text]
def preprocessDataFrame(df):
    """
    Run the preprocessing pipeline on all tweets to generate the feature
    ``full_text_processed``: translating tweets to English, removing URLs
    and reserved words, lowercasing, punctuation removal, lemmatization,
    stopword removal, and VADER sentiment analysis (stored in the new
    ``sentiment`` column).

    Parameters
    ----------
    df : DataFrame
        Transformed DataFrame with original tweets.

    Returns
    -------
    df : DataFrame
        The same DataFrame (mutated in place) with processed tweets and a
        ``sentiment`` column.
    """
    # Translate every non-English tweet to English.
    df['full_text_processed'] = df.apply(
        lambda x: translate_func(x, 'full_text', 'lang'),
        axis=1
    )
    # Some rows come through as float (e.g. NaN), so coerce to str to make
    # sure the string operations below cannot crash.
    df['full_text_processed'] = df['full_text_processed'].astype(str)

    # Strip URLs and reserved words (RT, FAV, ...) via tweet-preprocessor.
    api.set_options('urls', 'reserved_words')
    df['full_text_processed'] = df['full_text_processed'].apply(api.clean)

    # Lowercase everything.
    df['full_text_processed'] = df['full_text_processed'].apply(str.lower)

    # Remove punctuation; build the translation table once instead of
    # rebuilding it for every row.
    punct_table = str.maketrans('', '', string.punctuation)
    df['full_text_processed'] = df['full_text_processed'].apply(
        lambda x: x.translate(punct_table)
    )

    # Lemmatize each whitespace-separated token.
    lemmatizer = nltk.stem.WordNetLemmatizer()
    df['full_text_processed'] = df['full_text_processed'].apply(
        lambda x: ' '.join(lemmatizer.lemmatize(w) for w in x.split())
    )

    # Drop English stopwords.
    stop_words = set(stopwords.words('english'))
    df['full_text_processed'] = df['full_text_processed'].apply(
        lambda x: ' '.join(w for w in x.split() if w not in stop_words)
    )

    # VADER compound sentiment of the processed text, added as part of
    # preprocessing. A plain column .apply suffices here — the original
    # row-wise apply(axis=1) only ever read this one column.
    sid = SentimentIntensityAnalyzer()
    df['sentiment'] = df['full_text_processed'].apply(
        lambda t: sid.polarity_scores(t)['compound']
    )
    return df