# Source code for src.models._user2vec

import numpy as np
import pandas as pd
from gensim.models.doc2vec import Doc2Vec
from tqdm import tqdm

from ._utils import tokenize


tqdm.pandas()


[docs]class User2Vec(Doc2Vec): """ Generates vectors for each user in the dataset. """
[docs] def __init__(self, vector_size, min_count, epochs, **kwargs): super().__init__( vector_size=vector_size, min_count=min_count, epochs=epochs, **kwargs )
def infer_user_vector(self, doc_words, **kwargs): """ Computes the user vector. Parameters ---------- doc_words : list or pd.Series Documents belonging to the given user. Returns ------- user_vector : np.ndarray User Vector. """ srs_doc_words = pd.Series(doc_words) # Vectorize docs within doc_words doc_vectors = srs_doc_words.apply( lambda doc: self.infer_vector(tokenize(doc), **kwargs) ) return doc_vectors.mean(0) def infer_user_vectors(self, users, doc_words): """ Computes the user vectors for a set of users and documents. Parameters ---------- users : list or pd.Series User identifiers for each document. Will be used to aggregate the documents per each user. doc_words : list or pd.Series Documents in dataset. Returns ------- user_vector : np.ndarray User Vector. """ tweet_vectors = pd.Series(doc_words).progress_apply( lambda doc: self.infer_vector(tokenize(doc)) ).tolist() df_vectors = pd.DataFrame(tweet_vectors)\ .join( pd.Series(users, name='users').reset_index(drop=True) ).groupby('users').mean() user_ids = df_vectors.index.values user_vectors = df_vectors.values return user_ids, user_vectors