Source code for src.models._utils

from gensim.models.doc2vec import TaggedDocument
from gensim.utils import simple_preprocess


def tokenize(doc, tag=None):
    """
    Split one piece of text into tokens, optionally tagging the result.

    The text is tokenized with ``gensim.utils.simple_preprocess``. When a
    tag is supplied, the tokens are wrapped in a ``TaggedDocument`` whose
    tag list contains only that tag.

    Parameters
    ----------
    doc : str
        Text to be tokenized.
    tag : int, str or NoneType, default=None
        Document identifier. If None, the plain list of tokens is
        returned instead of a tagged document.

    Returns
    -------
    document : list or gensim.models.doc2vec.TaggedDocument
        The token list when ``tag`` is None, otherwise a
        ``TaggedDocument`` built from the tokens and ``[tag]``.
    """
    words = simple_preprocess(doc)

    # Compare against None explicitly: falsy identifiers such as 0 or ""
    # still produce a tagged document.
    if tag is not None:
        return TaggedDocument(words, [tag])
    return words