Internal functions for tokenization, providing the default and legacy methods for text segmentation.
Usage

tokenize_word(x, split_hyphens = FALSE, verbose = quanteda_options("verbose"))
tokenize_word1(x, split_hyphens = FALSE, verbose = quanteda_options("verbose"))
tokenize_character(x, ...)
tokenize_sentence(x, ..., verbose = FALSE)
tokenize_fasterword(x, ...)
tokenize_fastestword(x, ...)
Arguments

x: (named) character; input texts
split_hyphens: logical; if FALSE (the default), do not split words that are connected by hyphenation and hyphenation-like characters, so that "self-aware" remains a single token; if TRUE, split at those characters, so that "self-aware" becomes c("self", "-", "aware") (see the sketch after these arguments)
verbose: if TRUE, print timing messages to the console
...: used to pass arguments among the functions
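The following sketch illustrates the split_hyphens argument. It assumes quanteda is installed and these functions are callable; the commented results are indicative, following the description above, rather than guaranteed for every version.

txt <- "A self-aware machine."
tokenize_word(txt, split_hyphens = FALSE)  # keeps "self-aware" as a single token
tokenize_word(txt, split_hyphens = TRUE)   # yields "self", "-", "aware" as separate tokens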
Value

a list of character vectors corresponding to the (most conservative) tokenization, including whitespace where applicable; except for tokenize_word1(), which is a special tokenizer for Internet language that includes URLs, #hashtags, @usernames, and email addresses.
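As a sketch of that return structure (one list element per input text; the exact behaviour with named inputs is an assumption based on the description above):

toks <- tokenize_word(c(d1 = "One two.", d2 = "Three."))
is.list(toks)                           # TRUE: the result is a plain list
vapply(toks, is.character, logical(1))  # TRUE for each element: character vectors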
Examples

if (FALSE) {
txt <- c(doc1 = "Tweet https://quanteda.io using @quantedainit and #rstats.",
doc2 = "The £1,000,000 question.",
doc4 = "Line 1.\nLine2\n\nLine3.",
doc5 = "?",
doc6 = "Self-aware machines! \U0001f600")
tokenize_word(txt)
tokenize_word(txt, split_hyphens = TRUE)
tokenize_word1(txt, split_hyphens = FALSE)
tokenize_word1(txt, split_hyphens = TRUE)
tokenize_fasterword(txt)
tokenize_fastestword(txt)
tokenize_sentence(txt)
tokenize_character(txt[2])
}
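Note that these are internal functions; if your installed version of quanteda does not export them, they can still be reached with R's triple-colon operator, e.g.:

quanteda:::tokenize_word(txt)
quanteda:::tokenize_sentence(txt)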