An alias of strj_tokenize(engine = "budoux").
Usage
strj_segment(text, format = c("list", "data.frame"), split = FALSE)
Examples
strj_segment(
paste0(
"\u3042\u306e\u30a4\u30fc\u30cf\u30c8",
"\u30fc\u30f4\u30a9\u306e\u3059\u304d",
"\u3068\u304a\u3063\u305f\u98a8"
)
)
#> $`1`
#> [1] "あの" "イーハトーヴォの" "すきとおった" "風"
#>
strj_segment(
paste0(
"\u3042\u306e\u30a4\u30fc\u30cf\u30c8",
"\u30fc\u30f4\u30a9\u306e\u3059\u304d",
"\u3068\u304a\u3063\u305f\u98a8"
),
format = "data.frame"
)
#>   doc_id            token
#> 1      1             あの
#> 2      1 イーハトーヴォの
#> 3      1     すきとおった
#> 4      1               風