Latent Semantic Analysis

Latent Semantic Analysis (LSA) represents documents in a low-dimensional "latent" space by computing a truncated singular value decomposition (SVD) of the (typically tf-idf weighted) document-term matrix. Documents that use similar vocabulary end up close together in this space, even when they share few exact terms.
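
To make the idea concrete, here is a minimal sketch of LSA using base R's svd() on a toy matrix (this is not the text2vec API; the matrix and dimensions are made up for illustration):

# toy 10-document x 5-term count matrix (hypothetical data)
X = matrix(rpois(50, lambda = 1), nrow = 10)
# truncated SVD: X is approximated by U_k D_k t(V_k), here with k = 2
s = svd(X, nu = 2, nv = 2)
# k-dimensional document embeddings = U_k D_k
doc_embeddings = s$u %*% diag(s$d[1:2])
dim(doc_embeddings)  # 10 documents x 2 latent dimensions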

As usual, we will use the built-in text2vec::movie_review dataset. Let's clean it up a little and create a DTM:

library(stringr)
library(text2vec)
data("movie_review")
# use 500 reviews for training and another 500 as held-out test data (faster running times)
movie_review_test = movie_review[501:1000, ]
movie_review_train = movie_review[1:500, ]
prep_fun = function(x) {
  x %>% 
    # make text lower case
    str_to_lower %>% 
    # remove non-alphanumeric symbols
    str_replace_all("[^[:alnum:]]", " ") %>% 
    # collapse multiple spaces
    str_replace_all("\\s+", " ")
}
movie_review_train$review = prep_fun(movie_review_train$review)
it = itoken(movie_review_train$review, progressbar = FALSE)
v = create_vocabulary(it) %>% 
  prune_vocabulary(doc_proportion_max = 0.1, term_count_min = 5)
vectorizer = vocab_vectorizer(v)
dtm = create_dtm(it, vectorizer)

Now we will perform tf-idf scaling, then fit and apply the LSA model:

tfidf = TfIdf$new()
lsa = LSA$new(n_topics = 10)

# pipe friendly transformation
dtm_tfidf_lsa = dtm %>% 
  fit_transform(tfidf) %>% 
  fit_transform(lsa)
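
The result is a dense matrix with one row per document and one column per latent dimension. A quick sanity check of its shape (500 training documents, 10 topics):

dim(dtm_tfidf_lsa)  # 500 documents x 10 latent dimensions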

And we can elegantly apply exactly the same transformation to new data, chaining the steps with magrittr's "not-a-pipe" operator %>%:

new_data = movie_review_test
new_data_dtm_tfidf_lsa = 
  new_data$review %>% 
  itoken(preprocessor = prep_fun, progressbar = FALSE) %>% 
  create_dtm(vectorizer) %>% 
  transform(tfidf) %>% 
  transform(lsa)
head(new_data_dtm_tfidf_lsa)
##           [,1]       [,2]         [,3]         [,4]         [,5]
## 1 0.0010757498 0.07883588 -0.012792492  0.001803077 -0.016498829
## 2 0.0010309045 0.07827933 -0.002480813  0.007431587 -0.013045284
## 3 0.0008995570 0.09534925 -0.004880675 -0.007976679 -0.033347235
## 4 0.0009023917 0.09403420  0.012338197  0.007145261 -0.007353982
## 5 0.0010801880 0.07638067 -0.010959332  0.011642504 -0.017630506
## 6 0.0009748510 0.08042964 -0.008732528  0.007173264 -0.014004544
##            [,6]        [,7]          [,8]         [,9]        [,10]
## 1  0.0103262362 0.009852323 -0.0064973268  0.002294977  0.003113058
## 2 -0.0001511818 0.007605538 -0.0076489837  0.005333738  0.003353584
## 3 -0.0028268780 0.016753258 -0.0423168721 -0.521265026  0.038670324
## 4  0.0171269927 0.005365014 -0.0002391057 -0.007158379  0.012387447
## 5 -0.0034931519 0.005086364  0.0020006889 -0.001887502 -0.003574490
## 6 -0.0124146267 0.003978910 -0.0209399126  0.008284681  0.011476810
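
These low-dimensional document embeddings can be fed into downstream tasks such as similarity search or classification. As a quick sketch (reusing the train and test embeddings computed above), pairwise cosine similarities between training and test documents can be obtained with text2vec's sim2() function:

# cosine similarity between every training and every test document,
# computed in the shared 10-dimensional LSA space
doc_sim = sim2(x = dtm_tfidf_lsa, y = new_data_dtm_tfidf_lsa,
               method = "cosine", norm = "l2")
dim(doc_sim)  # 500 training documents x 500 test documents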

Latent Dirichlet Allocation

Latent Dirichlet Allocation (LDA) is a probabilistic topic model: each document is modeled as a mixture of topics, and each topic as a distribution over words. As before, we tokenize the reviews, build a vocabulary, and create a DTM (here in the "lda_c" format expected by the LDA model):

tokens = movie_review$review %>% 
  tolower %>% 
  word_tokenizer
# turn off progressbar because it won't look nice in rmd
it = itoken(tokens, ids = movie_review$id, progressbar = FALSE)
v = create_vocabulary(it) %>% 
  prune_vocabulary(term_count_min = 10, doc_proportion_max = 0.2)
vectorizer = vocab_vectorizer(v)
dtm = create_dtm(it, vectorizer, type = "lda_c")

lda_model = 
  LDA$new(n_topics = 10, vocabulary = v, 
          doc_topic_prior = 0.1, topic_word_prior = 0.01)
doc_topic_distr = 
  lda_model$fit_transform(dtm, n_iter = 1000, convergence_tol = 0.01, 
                          check_convergence_every_n = 10)
lda_model$plot()
[The call above opens an interactive LDAvis visualization of the fitted topics.]
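
Beyond the interactive plot, the fitted model can be inspected and applied programmatically. A sketch, assuming a text2vec version whose LDA class exposes the get_top_words() and transform() methods (the example reviews below are made up):

# top 10 words for each of the 10 topics, ranked by LDAvis-style relevance
lda_model$get_top_words(n = 10, topic_number = 1:10, lambda = 0.3)

# infer topic proportions for new, unseen documents
new_reviews = c("an instant classic, beautifully shot and acted",
                "dull plot and flat dialogue, a waste of two hours")
new_dtm = itoken(new_reviews, tolower, word_tokenizer, progressbar = FALSE) %>% 
  create_dtm(vectorizer, type = "lda_c")
new_doc_topic_distr = lda_model$transform(new_dtm)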