Skip to contents

データの準備

livedoorニュースコーパスを使います。このコーパスのカテゴリ分類はかなり易しいタスクであることが知られている(というか、一部のカテゴリではそのカテゴリを同定できる単語が本文に含まれてしまっている)ので、機械学習を手軽に試すのに便利です。テキストの特徴量をもとに以下の9カテゴリの分類をします。

  • トピックニュース
  • Sports Watch
  • ITライフハック
  • 家電チャンネル
  • MOVIE ENTER
  • 独女通信
  • エスマックス
  • livedoor HOMME
  • Peachy

ldccrでデータフレームにします。

# Read the livedoor news corpus into a data frame and assign each
# document a character ID derived from its row position.
tbl <- ldccr::read_ldnws()
tbl <- tbl |>
  dplyr::mutate(doc_id = as.character(dplyr::row_number()))
#> Parsing dokujo-tsushin...
#> Parsing it-life-hack...
#> Parsing kaden-channel...
#> Parsing livedoor-homme...
#> Parsing movie-enter...
#> Parsing peachy...
#> Parsing smax...
#> Parsing sports-watch...
#> Parsing topic-news...
#> Done.

ここでは、KH Coderの品詞体系における名詞・地名・人名・組織名・固有名詞・動詞・未知語を抽出し、IPA辞書に収録されている語については原形にしながら分かち書きにします。

# Tokenize the corpus with gibasa, mapping the IPA-dictionary POS tags
# onto KH Coder-style labels, then keep only nouns, place/person/
# organization names, proper nouns, verbs, and unknown words.
# Tokens are lemmatized when the IPA dictionary provides an original
# form, encoded as "token/pos", and packed back to one string per doc.
corp <- tbl |>
  dplyr::mutate(
    # NFKC-normalize the body, replace URLs with a pre-analyzed
    # "URL<TAB>タグ" line (consumed by `partial = TRUE` below),
    # collapse runs of whitespace, and trim both ends.
    text = stringi::stri_trans_nfkc(body) |>
      stringi::stri_replace_all_regex("(https?\\://[[:alnum:]\\.\\-_/]+)", "\nURL\tタグ\n") |>
      stringi::stri_replace_all_regex("[\\s]{2,}", "\n") |>
      stringi::stri_trim_both(),
    # Split documents into 10 chunks so tokenization runs piecewise
    # rather than over the whole corpus at once.
    chunk = dplyr::ntile(dplyr::row_number(), 10)
  ) |>
  dplyr::group_by(chunk) |>
  dplyr::group_modify(\(df, idx) {
    data.frame(
      doc_id = df$doc_id,
      text = df$text
    ) |>
      gibasa::tokenize(text, partial = TRUE) |>
      gibasa::prettify(
        col_select = c("POS1", "POS2", "POS3", "Original")
      ) |>
      dplyr::mutate(
        # Collapse the IPA POS hierarchy into KH Coder-style labels.
        # Order matters: the first matching condition wins.
        pos = dplyr::case_when(
          (POS1 == "タグ") ~ "タグ",
          # No dictionary lemma and purely alphabetic -> unknown word.
          (is.na(Original) & stringr::str_detect(token, "^[[:alpha:]]+$")) ~ "未知語",
          (POS1 == "感動詞") ~ "感動詞",
          # Single-kanji and all-hiragana common nouns get their own
          # buckets (名詞C / 名詞B), mirroring KH Coder's scheme.
          (POS1 == "名詞" & POS2 == "一般" & stringr::str_detect(token, "^[\\p{Han}]{1}$")) ~ "名詞C",
          (POS1 == "名詞" & POS2 == "一般" & stringr::str_detect(token, "^[\\p{Hiragana}]+$")) ~ "名詞B",
          (POS1 == "名詞" & POS2 == "一般") ~ "名詞",
          (POS1 == "名詞" & POS2 == "固有名詞" & POS3 == "地域") ~ "地名",
          (POS1 == "名詞" & POS2 == "固有名詞" & POS3 == "人名") ~ "人名",
          (POS1 == "名詞" & POS2 == "固有名詞" & POS3 == "組織") ~ "組織名",
          (POS1 == "名詞" & POS2 == "形容動詞語幹") ~ "形容動詞",
          (POS1 == "名詞" & POS2 == "ナイ形容詞語幹") ~ "ナイ形容詞",
          (POS1 == "名詞" & POS2 == "固有名詞") ~ "固有名詞",
          (POS1 == "名詞" & POS2 == "サ変接続") ~ "サ変名詞",
          (POS1 == "名詞" & POS2 == "副詞可能") ~ "副詞可能",
          # All-hiragana verbs/adjectives/adverbs are separated out
          # (the "B" classes), matching the buckets above.
          (POS1 == "動詞" & POS2 == "自立" & stringr::str_detect(token, "^[\\p{Hiragana}]+$")) ~ "動詞B",
          (POS1 == "動詞" & POS2 == "自立") ~ "動詞",
          (POS1 == "形容詞" & stringr::str_detect(token, "^[\\p{Hiragana}]+$")) ~ "形容詞B",
          (POS1 == "形容詞" & POS2 == "非自立") ~ "形容詞(非自立)",
          (POS1 == "形容詞") ~ "形容詞",
          (POS1 == "副詞" & stringr::str_detect(token, "^[\\p{Hiragana}]+$")) ~ "副詞B",
          (POS1 == "副詞") ~ "副詞",
          (POS1 == "助動詞" & Original %in% c("ない", "まい", "ぬ", "ん")) ~ "否定助動詞",
          .default = "その他"
        )
      ) |>
      dplyr::filter(
        # Keep only the POS classes used as features downstream.
        pos %in% c(
          "名詞",
          "地名", "人名", "組織名", "固有名詞",
          "動詞", "未知語"
        )
      ) |>
      dplyr::mutate(
        # Drop factor levels for documents emptied by the filter.
        doc_id = droplevels(doc_id),
        # Prefer the dictionary's original (lemma) form when present.
        token = dplyr::if_else(is.na(Original), token, Original),
        # Encode each token as "surface/pos" so the POS label travels
        # with the token through packing and hashing.
        token = paste(token, pos, sep = "/")
      ) |>
      gibasa::pack()
  }) |>
  dplyr::ungroup() |>
  # Reattach the category labels by document ID.
  dplyr::left_join(dplyr::select(tbl, doc_id, category), by = "doc_id")

モデルの学習

データを分割します。

# Hold out 20% of the documents for testing, stratified by category
# so both splits keep the same class balance.
corp_split <- rsample::initial_split(corp, strata = "category", prop = .8)
corp_train <- corp_split |> rsample::training()
corp_test <- corp_split |> rsample::testing()

以下のレシピとモデルで学習します。ここでは、ハッシュトリックを使っています。

なお、tidymodelsの枠組みの外であらかじめ分かち書きを済ませましたが、textrecipes::step_tokenize()のcustom_token引数に独自にトークナイザを指定することで、一つのstepとして分かち書きすることもできます。

# Number of hash features; also reused as the number of boosting rounds.
NUM_TERMS <- 100L

# xgboost classifier: tree_depth and mtry are tuned, the rest are fixed.
corp_spec <- parsnip::boost_tree(
  trees = !!NUM_TERMS, # inject a variable defined outside the model_spec
  tree_depth = tune::tune(),
  mtry = tune::tune(),
  min_n = 5,
  learn_rate = .3,
  stop_iter = 5 # kept small because this is just an example
) |>
  parsnip::set_engine(
    "xgboost",
    # Leave one core free; fall back to 1 if detectCores() returns NA.
    nthread = !!max(1, parallel::detectCores() - 1, na.rm = TRUE)
  ) |>
  parsnip::set_mode("classification")

# Preprocessing recipe: split the pre-tokenized text on spaces, keep
# the most frequent terms, then hash them down to NUM_TERMS features.
corp_rec <- recipes::recipe(category ~ text, data = corp_train) |>
  textrecipes::step_tokenize(text, custom_token = \(x) strsplit(x, " +")) |>
  textrecipes::step_tokenfilter(
    text,
    max_times = nrow(corp_train),
    max_tokens = NUM_TERMS * 5
  ) |>
  textrecipes::step_texthash(text, num_terms = NUM_TERMS)

# Bundle the model spec and the recipe into a single workflow.
corp_wflow <- workflows::workflow() |>
  workflows::add_model(corp_spec) |>
  workflows::add_recipe(corp_rec)

F値をメトリクスにして学習します。5分割CVで、簡単にですが、ハイパーパラメータ探索をします。

# Hyperparameter search: 10 Latin-hypercube candidates, evaluated with
# 5-fold CV (stratified by category) using the F-measure.
corp_folds <- rsample::vfold_cv(corp_train, strata = category, v = 5L)
corp_grid <- dials::grid_latin_hypercube(
  dials::tree_depth(),
  dials::mtry(range = c(30L, NUM_TERMS)),
  size = 10L
)
corp_tune_res <- tune::tune_grid(
  corp_wflow,
  resamples = corp_folds,
  grid = corp_grid,
  metrics = yardstick::metric_set(yardstick::f_meas),
  control = tune::control_grid(save_pred = TRUE)
)

ハイパラ探索の要約を確認します。

# Visualize how the candidate hyperparameters relate to the F-measure.
corp_tune_res |>
  ggplot2::autoplot()

fitします。

# Lock in the best hyperparameters found by the search, then refit on
# the full training set and evaluate once on the held-out test set.
best_params <- tune::select_best(corp_tune_res, metric = "f_meas")
corp_wflow <- tune::finalize_workflow(corp_wflow, best_params)

corp_fit <- tune::last_fit(corp_wflow, corp_split)

学習したモデルの精度を見てみます。

# Macro-averaged F-measure on the held-out test predictions.
tune::collect_predictions(corp_fit) |>
  yardstick::f_meas(truth = category, estimate = .pred_class)
#> # A tibble: 1 × 3
#>   .metric .estimator .estimate
#>   <chr>   <chr>          <dbl>
#> 1 f_meas  macro          0.844

セッション情報

# Record the R version and package versions for reproducibility.
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value
#>  version  R version 4.3.3 (2024-02-29)
#>  os       Ubuntu 22.04.4 LTS
#>  system   x86_64, linux-gnu
#>  ui       X11
#>  language en
#>  collate  C.UTF-8
#>  ctype    C.UTF-8
#>  tz       UTC
#>  date     2024-03-23
#>  pandoc   3.1.11 @ /opt/hostedtoolcache/pandoc/3.1.11/x64/ (via rmarkdown)
#> 
#> ─ Packages ───────────────────────────────────────────────────────────────────
#>  package      * version    date (UTC) lib source
#>  backports      1.4.1      2021-12-13 [2] RSPM
#>  bit            4.0.5      2022-11-15 [2] RSPM
#>  bit64          4.0.5      2020-08-30 [2] RSPM
#>  broom        * 1.0.5      2023-06-09 [2] RSPM
#>  bslib          0.6.1      2023-11-28 [2] RSPM
#>  cachem         1.0.8      2023-05-01 [2] RSPM
#>  class          7.3-22     2023-05-03 [4] CRAN (R 4.3.3)
#>  cli            3.6.2      2023-12-11 [2] RSPM
#>  codetools      0.2-19     2023-02-01 [4] CRAN (R 4.3.3)
#>  colorspace     2.1-0      2023-01-23 [2] RSPM
#>  conflicted     1.2.0      2023-02-01 [2] RSPM
#>  crayon         1.5.2      2022-09-29 [2] RSPM
#>  data.table     1.15.2     2024-02-29 [2] RSPM
#>  desc           1.4.3      2023-12-10 [2] RSPM
#>  dials        * 1.2.1      2024-02-22 [2] RSPM
#>  DiceDesign     1.10       2023-12-07 [2] RSPM
#>  digest         0.6.35     2024-03-11 [2] RSPM
#>  dplyr        * 1.1.4      2023-11-17 [2] RSPM
#>  ellipsis       0.3.2      2021-04-29 [2] RSPM
#>  evaluate       0.23       2023-11-01 [2] RSPM
#>  fansi          1.0.6      2023-12-08 [2] RSPM
#>  farver         2.1.1      2022-07-06 [2] RSPM
#>  fastmap        1.1.1      2023-02-24 [2] RSPM
#>  float          0.3-2      2023-12-10 [2] RSPM
#>  foreach        1.5.2      2022-02-02 [2] RSPM
#>  fs             1.6.3      2023-07-20 [2] RSPM
#>  furrr          0.3.1      2022-08-15 [2] RSPM
#>  future         1.33.1     2023-12-22 [2] RSPM
#>  future.apply   1.11.1     2023-12-21 [2] RSPM
#>  generics       0.1.3      2022-07-05 [2] RSPM
#>  ggplot2      * 3.5.0      2024-02-23 [2] RSPM
#>  gibasa         1.1.0      2024-03-23 [1] local
#>  globals        0.16.3     2024-03-08 [2] RSPM
#>  glue           1.7.0      2024-01-09 [2] RSPM
#>  gower          1.0.1      2022-12-22 [2] RSPM
#>  GPfit          1.0-8      2019-02-08 [2] RSPM
#>  gtable         0.3.4      2023-08-21 [2] RSPM
#>  hardhat        1.3.1      2024-02-02 [2] RSPM
#>  highr          0.10       2022-12-22 [2] RSPM
#>  hms            1.1.3      2023-03-21 [2] RSPM
#>  htmltools      0.5.7      2023-11-03 [2] RSPM
#>  infer        * 1.0.6      2024-01-31 [2] RSPM
#>  ipred          0.9-14     2023-03-09 [2] RSPM
#>  iterators      1.0.14     2022-02-05 [2] RSPM
#>  jquerylib      0.1.4      2021-04-26 [2] RSPM
#>  jsonlite       1.8.8      2023-12-04 [2] RSPM
#>  knitr          1.45       2023-10-30 [2] RSPM
#>  labeling       0.4.3      2023-08-29 [2] RSPM
#>  lattice        0.22-5     2023-10-24 [4] CRAN (R 4.3.3)
#>  lava           1.8.0      2024-03-05 [2] RSPM
#>  ldccr          2024.02.04 2024-03-22 [2] Github (paithiov909/ldccr@0f566b0)
#>  lgr            0.4.4      2022-09-05 [2] RSPM
#>  lhs            1.1.6      2022-12-17 [2] RSPM
#>  lifecycle      1.0.4      2023-11-07 [2] RSPM
#>  listenv        0.9.1      2024-01-29 [2] RSPM
#>  lubridate      1.9.3      2023-09-27 [2] RSPM
#>  magrittr       2.0.3      2022-03-30 [2] RSPM
#>  MASS           7.3-60.0.1 2024-01-13 [4] CRAN (R 4.3.3)
#>  Matrix         1.6-5      2024-01-11 [4] CRAN (R 4.3.3)
#>  memoise        2.0.1      2021-11-26 [2] RSPM
#>  mlapi          0.1.1      2022-04-24 [2] RSPM
#>  modeldata    * 1.3.0      2024-01-21 [2] RSPM
#>  munsell        0.5.0      2018-06-12 [2] RSPM
#>  nnet           7.3-19     2023-05-03 [4] CRAN (R 4.3.3)
#>  parallelly     1.37.1     2024-02-29 [2] RSPM
#>  parsnip      * 1.2.0      2024-02-16 [2] RSPM
#>  pillar         1.9.0      2023-03-22 [2] RSPM
#>  pkgconfig      2.0.3      2019-09-22 [2] RSPM
#>  pkgdown        2.0.7      2022-12-14 [2] any (@2.0.7)
#>  prodlim        2023.08.28 2023-08-28 [2] RSPM
#>  purrr        * 1.0.2      2023-08-10 [2] RSPM
#>  R.cache        0.16.0     2022-07-21 [2] RSPM
#>  R.methodsS3    1.8.2      2022-06-13 [2] RSPM
#>  R.oo           1.26.0     2024-01-24 [2] RSPM
#>  R.utils        2.12.3     2023-11-18 [2] RSPM
#>  R6             2.5.1      2021-08-19 [2] RSPM
#>  ragg           1.3.0      2024-03-13 [2] RSPM
#>  Rcpp           1.0.12     2024-01-09 [2] RSPM
#>  RcppParallel   5.1.7      2023-02-27 [2] RSPM
#>  readr          2.1.5      2024-01-10 [2] RSPM
#>  recipes      * 1.0.10     2024-02-18 [2] RSPM
#>  RhpcBLASctl    0.23-42    2023-02-11 [2] RSPM
#>  rlang          1.1.3      2024-01-10 [2] RSPM
#>  rmarkdown      2.26       2024-03-05 [2] RSPM
#>  rpart          4.1.23     2023-12-05 [4] CRAN (R 4.3.3)
#>  rsample      * 1.2.0      2023-08-23 [2] RSPM
#>  rsparse        0.5.1      2022-09-11 [2] RSPM
#>  rstudioapi     0.15.0     2023-07-07 [2] RSPM
#>  sass           0.4.9      2024-03-15 [2] RSPM
#>  scales       * 1.3.0      2023-11-28 [2] RSPM
#>  sessioninfo    1.2.2      2021-12-06 [2] any (@1.2.2)
#>  stringi        1.8.3      2023-12-11 [2] RSPM
#>  stringr        1.5.1      2023-11-14 [2] RSPM
#>  styler         1.10.2     2023-08-29 [2] any (@1.10.2)
#>  survival       3.5-8      2024-02-14 [4] CRAN (R 4.3.3)
#>  systemfonts    1.0.6      2024-03-07 [2] RSPM
#>  text2vec     * 0.6.4      2023-11-09 [2] RSPM
#>  textrecipes  * 1.0.6      2023-11-15 [2] RSPM
#>  textshaping    0.3.7      2023-10-09 [2] RSPM
#>  tibble       * 3.2.1      2023-03-20 [2] RSPM
#>  tidymodels   * 1.1.1      2023-08-24 [2] RSPM
#>  tidyr        * 1.3.1      2024-01-24 [2] RSPM
#>  tidyselect     1.2.1      2024-03-11 [2] RSPM
#>  timechange     0.3.0      2024-01-18 [2] RSPM
#>  timeDate       4032.109   2023-12-14 [2] RSPM
#>  tune         * 1.2.0      2024-03-20 [2] RSPM
#>  tzdb           0.4.0      2023-05-12 [2] RSPM
#>  utf8           1.2.4      2023-10-22 [2] RSPM
#>  vctrs          0.6.5      2023-12-01 [2] RSPM
#>  vroom          1.6.5      2023-12-05 [2] RSPM
#>  withr          3.0.0      2024-01-16 [2] RSPM
#>  workflows    * 1.1.4      2024-02-19 [2] RSPM
#>  workflowsets * 1.1.0      2024-03-21 [2] RSPM
#>  xfun           0.42       2024-02-08 [2] RSPM
#>  xgboost      * 1.7.7.1    2024-01-25 [2] RSPM
#>  yaml           2.3.8      2023-12-11 [2] RSPM
#>  yardstick    * 1.3.1      2024-03-21 [2] RSPM
#> 
#>  [1] /tmp/RtmpheSglV/temp_libpath36517838a50d
#>  [2] /home/runner/work/_temp/Library
#>  [3] /opt/R/4.3.3/lib/R/site-library
#>  [4] /opt/R/4.3.3/lib/R/library
#> 
#> ──────────────────────────────────────────────────────────────────────────────