R Markdown

Bigrams (size = 2): path = 1 vs. path = 2

# Tokenize the inaugural corpus and drop English stopwords
# (tokens_remove() is called with padding = TRUE).
toks2 <- tokens_remove(
  tokens(data_corpus_inaugural),
  stopwords("english"),
  padding = TRUE
)
# Time size-2 collocation scoring under the two `path` settings; the result
# labels follow the argument names (path_R is path = 1, path_C is path = 2).
microbenchmark::microbenchmark(
  path_R = textstat_collocations(
    toks2, size = 2, tolower = FALSE, method = "all", path = 1
  ),
  path_C = textstat_collocations(
    toks2, size = 2, tolower = FALSE, method = "all", path = 2
  ),
  times = 2,
  unit = "relative"
)
## Unit: relative
##    expr      min       lq     mean   median       uq      max neval
##  path_R 2.725511 2.725511 2.665482 2.665482 2.609778 2.609778     2
##  path_C 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000     2

Compare minimum counts (min_count = 2 vs. min_count = 1, both with path = 2)

# Tokenize the SOTU corpus and drop English stopwords
# (tokens_remove() is called with padding = TRUE).
toks2 <- tokens_remove(
  tokens(quantedaData::data_corpus_SOTU),
  stopwords("english"),
  padding = TRUE
)
# NOTE(review): both expressions use path = 2; the only difference between
# them is min_count (2 vs. 1). The labels path_R / path_C therefore do not
# distinguish code paths here -- they are kept as-is so the labels in the
# printed benchmark table stay in sync with this call.
microbenchmark::microbenchmark(
  path_R = textstat_collocations(
    toks2, size = 3, tolower = FALSE, method = "lambda",
    min_count = 2, path = 2
  ),
  path_C = textstat_collocations(
    toks2, size = 3, tolower = FALSE, method = "lambda",
    min_count = 1, path = 2
  ),
  times = 2,
  unit = "relative"
)
## Warning in evalq((function (..., call. = TRUE, immediate. = FALSE,
## noBreaks. = FALSE, : Warning: ipf algorithm did not converge for at least
## once

## Warning in evalq((function (..., call. = TRUE, immediate. = FALSE,
## noBreaks. = FALSE, : Warning: ipf algorithm did not converge for at least
## once

## Warning in evalq((function (..., call. = TRUE, immediate. = FALSE,
## noBreaks. = FALSE, : Warning: ipf algorithm did not converge for at least
## once

## Warning in evalq((function (..., call. = TRUE, immediate. = FALSE,
## noBreaks. = FALSE, : Warning: ipf algorithm did not converge for at least
## once
## Unit: relative
##    expr      min       lq     mean   median       uq      max neval
##  path_R  1.00000  1.00000  1.00000  1.00000  1.00000  1.00000     2
##  path_C 11.93741 11.93741 11.98982 11.98982 12.04222 12.04222     2

Trigrams (size = 3): path = 1 vs. path = 2

# Tokenize the inaugural corpus and drop English stopwords
# (tokens_remove() is called with padding = TRUE).
toks2 <- tokens_remove(
  tokens(data_corpus_inaugural),
  stopwords("english"),
  padding = TRUE
)

# Time size-3 collocation scoring under the two `path` settings; the result
# labels follow the argument names (path_R is path = 1, path_C is path = 2).
microbenchmark::microbenchmark(
  path_R = textstat_collocations(
    toks2, size = 3, tolower = FALSE, method = "all", path = 1
  ),
  path_C = textstat_collocations(
    toks2, size = 3, tolower = FALSE, method = "all", path = 2
  ),
  times = 2,
  unit = "relative"
)
## Unit: relative
##    expr     min      lq    mean  median       uq      max neval
##  path_R 3.02222 3.02222 3.02029 3.02029 3.018381 3.018381     2
##  path_C 1.00000 1.00000 1.00000 1.00000 1.000000 1.000000     2

4-grams (size = 4): path = 1 vs. path = 2

# Tokenize the inaugural corpus and drop English stopwords
# (tokens_remove() is called with padding = TRUE).
toks2 <- tokens_remove(
  tokens(data_corpus_inaugural),
  stopwords("english"),
  padding = TRUE
)

# Time size-4 collocation scoring under the two `path` settings; the result
# labels follow the argument names (path_R is path = 1, path_C is path = 2).
microbenchmark::microbenchmark(
  path_R = textstat_collocations(
    toks2, size = 4, tolower = FALSE, method = "all", path = 1
  ),
  path_C = textstat_collocations(
    toks2, size = 4, tolower = FALSE, method = "all", path = 2
  ),
  times = 2,
  unit = "relative"
)
## Unit: relative
##    expr      min       lq     mean   median       uq      max neval
##  path_R 1.517191 1.517191 1.536343 1.536343 1.555169 1.555169     2
##  path_C 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000     2