# Source code for src.models._user2vec

import pandas as pd
from gensim.models.doc2vec import Doc2Vec
from ._utils import tokenize
from tqdm import tqdm


# Register tqdm's ``progress_apply`` on pandas objects; it is used by
# ``User2Vec.infer_user_vectors`` below.
tqdm.pandas()


class User2Vec(Doc2Vec):
    """
    Doc2Vec model that aggregates document vectors into user vectors.

    Every document is tokenized with ``tokenize`` and embedded with
    ``infer_vector``; the vector of a user is the mean of the vectors of
    the documents attributed to that user.
    """

    def __init__(self, vector_size, min_count, epochs, **kwargs):
        """
        Initializes the underlying ``Doc2Vec`` model.

        Parameters
        ----------
        vector_size : int
            Forwarded to ``Doc2Vec`` as ``vector_size``.
        min_count : int
            Forwarded to ``Doc2Vec`` as ``min_count``.
        epochs : int
            Forwarded to ``Doc2Vec`` as ``epochs``.
        **kwargs
            Any additional keyword arguments, forwarded unchanged to
            ``Doc2Vec``.
        """
        super().__init__(
            vector_size=vector_size,
            min_count=min_count,
            epochs=epochs,
            **kwargs
        )

    def infer_user_vector(self, doc_words, **kwargs):
        """
        Computes the vector of a single user.

        Parameters
        ----------
        doc_words : list or pd.Series
            Documents belonging to the given user.
        **kwargs
            Forwarded unchanged to ``infer_vector`` for every document.

        Returns
        -------
        user_vector : np.ndarray
            Mean of the vectors inferred for the documents in
            ``doc_words``.
        """
        srs_doc_words = pd.Series(doc_words)

        # Vectorize docs within doc_words
        doc_vectors = srs_doc_words.apply(
            lambda doc: self.infer_vector(tokenize(doc), **kwargs)
        )

        # Averaging the per-document vectors yields the user vector.
        return doc_vectors.mean(0)

    def infer_user_vectors(self, users, doc_words, **kwargs):
        """
        Computes the user vectors for a set of users and documents.

        Parameters
        ----------
        users : list or pd.Series
            User identifiers for each document. Will be used to aggregate the
            documents per each user. Paired with ``doc_words`` by position.
        doc_words : list or pd.Series
            Documents in dataset.
        **kwargs
            Forwarded unchanged to ``infer_vector`` for every document
            (same pass-through as ``infer_user_vector``).

        Returns
        -------
        user_ids : np.ndarray
            Distinct user identifiers, in the order of the ``groupby``
            result.
        user_vectors : np.ndarray
            Mean document vector of each user; row ``i`` belongs to
            ``user_ids[i]``.
        """
        # ``.tolist()`` discards the index of ``doc_words`` so that the
        # vectors are re-indexed 0..n-1 when the DataFrame is built.
        tweet_vectors = pd.Series(doc_words).progress_apply(
            lambda doc: self.infer_vector(tokenize(doc), **kwargs)
        ).tolist()

        # Drop the index of ``users`` as well, so that documents and users
        # are matched by position when joined.
        user_column = pd.Series(users, name='users').reset_index(drop=True)

        df_vectors = (
            pd.DataFrame(tweet_vectors)
            .join(user_column)
            .groupby('users')
            .mean()
        )

        user_ids = df_vectors.index.values
        user_vectors = df_vectors.values

        return user_ids, user_vectors