Skip to contents

データの準備

livedoorニュースコーパスを使います。このコーパスのカテゴリ分類はかなり易しいタスクであることが知られている(というか、一部のカテゴリではそのカテゴリを同定できる単語が本文に含まれてしまっている)ので、機械学習を手軽に試すのに便利です。テキストの特徴量をもとに以下の9カテゴリの分類をします。

  • トピックニュース
  • Sports Watch
  • ITライフハック
  • 家電チャンネル
  • MOVIE ENTER
  • 独女通信
  • エスマックス
  • livedoor HOMME
  • Peachy

ldccrでデータフレームにします。

# Read the livedoor news corpus into a data frame and assign each
# document a character ID derived from its row position.
tbl <- ldccr::read_ldnws()
tbl <- tbl |>
  dplyr::mutate(doc_id = as.character(dplyr::row_number()))
#> Parsing dokujo-tsushin...
#> Parsing it-life-hack...
#> Parsing kaden-channel...
#> Parsing livedoor-homme...
#> Parsing movie-enter...
#> Parsing peachy...
#> Parsing smax...
#> Parsing sports-watch...
#> Parsing topic-news...
#> Done.

ここでは、KH Coderの品詞体系における名詞・地名・人名・組織名・固有名詞・動詞・未知語を抽出し、IPA辞書に収録されている語については原形にしながら分かち書きにします。

# Tokenize the corpus with gibasa, mapping the IPA-dictionary POS tags
# onto KH Coder-style labels, then keep only nouns, place/person/
# organization names, proper nouns, verbs, and unknown words.
# Tokens are lemmatized when the IPA dictionary provides an original
# form, encoded as "token/pos", and packed back to one string per doc.
corp <- tbl |>
  dplyr::mutate(
    # NFKC-normalize the body, replace URLs with a pre-analyzed
    # "URL<TAB>タグ" line (consumed by `partial = TRUE` below),
    # collapse runs of whitespace, and trim both ends.
    text = stringi::stri_trans_nfkc(body) |>
      stringi::stri_replace_all_regex("(https?\\://[[:alnum:]\\.\\-_/]+)", "\nURL\tタグ\n") |>
      stringi::stri_replace_all_regex("[\\s]{2,}", "\n") |>
      stringi::stri_trim_both(),
    # Split documents into 10 chunks so tokenization runs piecewise
    # rather than over the whole corpus at once.
    chunk = dplyr::ntile(dplyr::row_number(), 10)
  ) |>
  dplyr::group_by(chunk) |>
  dplyr::group_modify(\(df, idx) {
    data.frame(
      doc_id = df$doc_id,
      text = df$text
    ) |>
      gibasa::tokenize(text, partial = TRUE) |>
      gibasa::prettify(
        col_select = c("POS1", "POS2", "POS3", "Original")
      ) |>
      dplyr::mutate(
        # Collapse the IPA POS hierarchy into KH Coder-style labels.
        # Order matters: the first matching condition wins.
        pos = dplyr::case_when(
          (POS1 == "タグ") ~ "タグ",
          # No dictionary lemma and purely alphabetic -> unknown word.
          (is.na(Original) & stringr::str_detect(token, "^[[:alpha:]]+$")) ~ "未知語",
          (POS1 == "感動詞") ~ "感動詞",
          # Single-kanji and all-hiragana common nouns get their own
          # buckets (名詞C / 名詞B), mirroring KH Coder's scheme.
          (POS1 == "名詞" & POS2 == "一般" & stringr::str_detect(token, "^[\\p{Han}]{1}$")) ~ "名詞C",
          (POS1 == "名詞" & POS2 == "一般" & stringr::str_detect(token, "^[\\p{Hiragana}]+$")) ~ "名詞B",
          (POS1 == "名詞" & POS2 == "一般") ~ "名詞",
          (POS1 == "名詞" & POS2 == "固有名詞" & POS3 == "地域") ~ "地名",
          (POS1 == "名詞" & POS2 == "固有名詞" & POS3 == "人名") ~ "人名",
          (POS1 == "名詞" & POS2 == "固有名詞" & POS3 == "組織") ~ "組織名",
          (POS1 == "名詞" & POS2 == "形容動詞語幹") ~ "形容動詞",
          (POS1 == "名詞" & POS2 == "ナイ形容詞語幹") ~ "ナイ形容詞",
          (POS1 == "名詞" & POS2 == "固有名詞") ~ "固有名詞",
          (POS1 == "名詞" & POS2 == "サ変接続") ~ "サ変名詞",
          (POS1 == "名詞" & POS2 == "副詞可能") ~ "副詞可能",
          # All-hiragana verbs/adjectives/adverbs are separated out
          # (the "B" classes), matching the buckets above.
          (POS1 == "動詞" & POS2 == "自立" & stringr::str_detect(token, "^[\\p{Hiragana}]+$")) ~ "動詞B",
          (POS1 == "動詞" & POS2 == "自立") ~ "動詞",
          (POS1 == "形容詞" & stringr::str_detect(token, "^[\\p{Hiragana}]+$")) ~ "形容詞B",
          (POS1 == "形容詞" & POS2 == "非自立") ~ "形容詞(非自立)",
          (POS1 == "形容詞") ~ "形容詞",
          (POS1 == "副詞" & stringr::str_detect(token, "^[\\p{Hiragana}]+$")) ~ "副詞B",
          (POS1 == "副詞") ~ "副詞",
          (POS1 == "助動詞" & Original %in% c("ない", "まい", "ぬ", "ん")) ~ "否定助動詞",
          .default = "その他"
        )
      ) |>
      dplyr::filter(
        # Keep only the POS classes used as features downstream.
        pos %in% c(
          "名詞",
          "地名", "人名", "組織名", "固有名詞",
          "動詞", "未知語"
        )
      ) |>
      dplyr::mutate(
        # Drop factor levels for documents emptied by the filter.
        doc_id = droplevels(doc_id),
        # Prefer the dictionary's original (lemma) form when present.
        token = dplyr::if_else(is.na(Original), token, Original),
        # Encode each token as "surface/pos" so the POS label travels
        # with the token through packing and hashing.
        token = paste(token, pos, sep = "/")
      ) |>
      gibasa::pack()
  }) |>
  dplyr::ungroup() |>
  # Reattach the category labels by document ID.
  dplyr::left_join(dplyr::select(tbl, doc_id, category), by = "doc_id")

モデルの学習

データを分割します。

# Hold out 20% of the documents for testing, stratified by category
# so both splits keep the same class balance.
corp_split <- rsample::initial_split(corp, strata = "category", prop = .8)
corp_train <- corp_split |> rsample::training()
corp_test <- corp_split |> rsample::testing()

以下のレシピとモデルで学習します。ここでは、ハッシュトリックを使っています。

なお、tidymodelsの枠組みの外であらかじめ分かち書きを済ませましたが、textrecipes::step_tokenize()のcustom_token引数に独自にトークナイザを指定することで、一つのstepとして分かち書きすることもできます。

# Number of hash features; also reused as the number of boosting rounds.
NUM_TERMS <- 100L

# xgboost classifier: tree_depth and mtry are tuned, the rest are fixed.
corp_spec <- parsnip::boost_tree(
  trees = !!NUM_TERMS, # inject a variable defined outside the model_spec
  tree_depth = tune::tune(),
  mtry = tune::tune(),
  min_n = 5,
  learn_rate = .3,
  stop_iter = 5 # kept small because this is just an example
) |>
  parsnip::set_engine(
    "xgboost",
    # Leave one core free; fall back to 1 if detectCores() returns NA.
    nthread = !!max(1, parallel::detectCores() - 1, na.rm = TRUE)
  ) |>
  parsnip::set_mode("classification")

# Preprocessing recipe: split the pre-tokenized text on spaces, keep
# the most frequent terms, then hash them down to NUM_TERMS features.
corp_rec <- recipes::recipe(category ~ text, data = corp_train) |>
  textrecipes::step_tokenize(text, custom_token = \(x) strsplit(x, " +")) |>
  textrecipes::step_tokenfilter(
    text,
    max_times = nrow(corp_train),
    max_tokens = NUM_TERMS * 5
  ) |>
  textrecipes::step_texthash(text, num_terms = NUM_TERMS)

# Bundle the model spec and the recipe into a single workflow.
corp_wflow <- workflows::workflow() |>
  workflows::add_model(corp_spec) |>
  workflows::add_recipe(corp_rec)

F値をメトリクスにして学習します。5分割CVで、簡単にですが、ハイパーパラメータ探索をします。

# Hyperparameter search: 10 Latin-hypercube candidates, evaluated with
# 5-fold CV (stratified by category) using the F-measure.
corp_folds <- rsample::vfold_cv(corp_train, strata = category, v = 5L)
corp_grid <- dials::grid_latin_hypercube(
  dials::tree_depth(),
  dials::mtry(range = c(30L, NUM_TERMS)),
  size = 10L
)
corp_tune_res <- tune::tune_grid(
  corp_wflow,
  resamples = corp_folds,
  grid = corp_grid,
  metrics = yardstick::metric_set(yardstick::f_meas),
  control = tune::control_grid(save_pred = TRUE)
)

ハイパラ探索の要約を確認します。

# Visualize how the candidate hyperparameters relate to the F-measure.
corp_tune_res |>
  ggplot2::autoplot()

fitします。

# Lock in the best hyperparameters found by the search, then refit on
# the full training set and evaluate once on the held-out test set.
best_params <- tune::select_best(corp_tune_res, metric = "f_meas")
corp_wflow <- tune::finalize_workflow(corp_wflow, best_params)

corp_fit <- tune::last_fit(corp_wflow, corp_split)

学習したモデルの精度を見てみます。

# Macro-averaged F-measure on the held-out test predictions.
tune::collect_predictions(corp_fit) |>
  yardstick::f_meas(truth = category, estimate = .pred_class)
#> # A tibble: 1 × 3
#>   .metric .estimator .estimate
#>   <chr>   <chr>          <dbl>
#> 1 f_meas  macro          0.844

セッション情報

# Record the R version and package versions for reproducibility.
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value
#>  version  R version 4.3.3 (2024-02-29)
#>  os       Ubuntu 22.04.4 LTS
#>  system   x86_64, linux-gnu
#>  ui       X11
#>  language en
#>  collate  C.UTF-8
#>  ctype    C.UTF-8
#>  tz       UTC
#>  date     2024-03-23
#>  pandoc   3.1.11 @ /opt/hostedtoolcache/pandoc/3.1.11/x64/ (via rmarkdown)
#> 
#> ─ Packages ───────────────────────────────────────────────────────────────────
#>  package      * version    date (UTC) lib source
#>  backports      1.4.1      2021-12-13 [2] RSPM
#>  bit            4.0.5      2022-11-15 [2] RSPM
#>  bit64          4.0.5      2020-08-30 [2] RSPM
#>  broom        * 1.0.5      2023-06-09 [2] RSPM
#>  bslib          0.6.1      2023-11-28 [2] RSPM
#>  cachem         1.0.8      2023-05-01 [2] RSPM
#>  class          7.3-22     2023-05-03 [4] CRAN (R 4.3.3)
#>  cli            3.6.2      2023-12-11 [2] RSPM
#>  codetools      0.2-19     2023-02-01 [4] CRAN (R 4.3.3)
#>  colorspace     2.1-0      2023-01-23 [2] RSPM
#>  conflicted     1.2.0      2023-02-01 [2] RSPM
#>  crayon         1.5.2      2022-09-29 [2] RSPM
#>  data.table     1.15.2     2024-02-29 [2] RSPM
#>  desc           1.4.3      2023-12-10 [2] RSPM
#>  dials        * 1.2.1      2024-02-22 [2] RSPM
#>  DiceDesign     1.10       2023-12-07 [2] RSPM
#>  digest         0.6.35     2024-03-11 [2] RSPM
#>  dplyr        * 1.1.4      2023-11-17 [2] RSPM
#>  ellipsis       0.3.2      2021-04-29 [2] RSPM
#>  evaluate       0.23       2023-11-01 [2] RSPM
#>  fansi          1.0.6      2023-12-08 [2] RSPM
#>  farver         2.1.1      2022-07-06 [2] RSPM
#>  fastmap        1.1.1      2023-02-24 [2] RSPM
#>  float          0.3-2      2023-12-10 [2] RSPM
#>  foreach        1.5.2      2022-02-02 [2] RSPM
#>  fs             1.6.3      2023-07-20 [2] RSPM
#>  furrr          0.3.1      2022-08-15 [2] RSPM
#>  future         1.33.1     2023-12-22 [2] RSPM
#>  future.apply   1.11.1     2023-12-21 [2] RSPM
#>  generics       0.1.3      2022-07-05 [2] RSPM
#>  ggplot2      * 3.5.0      2024-02-23 [2] RSPM
#>  gibasa         1.1.0      2024-03-23 [1] local
#>  globals        0.16.3     2024-03-08 [2] RSPM
#>  glue           1.7.0      2024-01-09 [2] RSPM
#>  gower          1.0.1      2022-12-22 [2] RSPM
#>  GPfit          1.0-8      2019-02-08 [2] RSPM
#>  gtable         0.3.4      2023-08-21 [2] RSPM
#>  hardhat        1.3.1      2024-02-02 [2] RSPM
#>  highr          0.10       2022-12-22 [2] RSPM
#>  hms            1.1.3      2023-03-21 [2] RSPM
#>  htmltools      0.5.7      2023-11-03 [2] RSPM
#>  infer        * 1.0.6      2024-01-31 [2] RSPM
#>  ipred          0.9-14     2023-03-09 [2] RSPM
#>  iterators      1.0.14     2022-02-05 [2] RSPM
#>  jquerylib      0.1.4      2021-04-26 [2] RSPM
#>  jsonlite       1.8.8      2023-12-04 [2] RSPM
#>  knitr          1.45       2023-10-30 [2] RSPM
#>  labeling       0.4.3      2023-08-29 [2] RSPM
#>  lattice        0.22-5     2023-10-24 [4] CRAN (R 4.3.3)
#>  lava           1.8.0      2024-03-05 [2] RSPM
#>  ldccr          2024.02.04 2024-03-22 [2] Github (paithiov909/ldccr@0f566b0)
#>  lgr            0.4.4      2022-09-05 [2] RSPM
#>  lhs            1.1.6      2022-12-17 [2] RSPM
#>  lifecycle      1.0.4      2023-11-07 [2] RSPM
#>  listenv        0.9.1      2024-01-29 [2] RSPM
#>  lubridate      1.9.3      2023-09-27 [2] RSPM
#>  magrittr       2.0.3      2022-03-30 [2] RSPM
#>  MASS           7.3-60.0.1 2024-01-13 [4] CRAN (R 4.3.3)
#>  Matrix         1.6-5      2024-01-11 [4] CRAN (R 4.3.3)
#>  memoise        2.0.1      2021-11-26 [2] RSPM
#>  mlapi          0.1.1      2022-04-24 [2] RSPM
#>  modeldata    * 1.3.0      2024-01-21 [2] RSPM
#>  munsell        0.5.0      2018-06-12 [2] RSPM
#>  nnet           7.3-19     2023-05-03 [4] CRAN (R 4.3.3)
#>  parallelly     1.37.1     2024-02-29 [2] RSPM
#>  parsnip      * 1.2.0      2024-02-16 [2] RSPM
#>  pillar         1.9.0      2023-03-22 [2] RSPM
#>  pkgconfig      2.0.3      2019-09-22 [2] RSPM
#>  pkgdown        2.0.7      2022-12-14 [2] any (@2.0.7)
#>  prodlim        2023.08.28 2023-08-28 [2] RSPM
#>  purrr        * 1.0.2      2023-08-10 [2] RSPM
#>  R.cache        0.16.0     2022-07-21 [2] RSPM
#>  R.methodsS3    1.8.2      2022-06-13 [2] RSPM
#>  R.oo           1.26.0     2024-01-24 [2] RSPM
#>  R.utils        2.12.3     2023-11-18 [2] RSPM
#>  R6             2.5.1      2021-08-19 [2] RSPM
#>  ragg           1.3.0      2024-03-13 [2] RSPM
#>  Rcpp           1.0.12     2024-01-09 [2] RSPM
#>  RcppParallel   5.1.7      2023-02-27 [2] RSPM
#>  readr          2.1.5      2024-01-10 [2] RSPM
#>  recipes      * 1.0.10     2024-02-18 [2] RSPM
#>  RhpcBLASctl    0.23-42    2023-02-11 [2] RSPM
#>  rlang          1.1.3      2024-01-10 [2] RSPM
#>  rmarkdown      2.26       2024-03-05 [2] RSPM
#>  rpart          4.1.23     2023-12-05 [4] CRAN (R 4.3.3)
#>  rsample      * 1.2.0      2023-08-23 [2] RSPM
#>  rsparse        0.5.1      2022-09-11 [2] RSPM
#>  rstudioapi     0.15.0     2023-07-07 [2] RSPM
#>  sass           0.4.9      2024-03-15 [2] RSPM
#>  scales       * 1.3.0      2023-11-28 [2] RSPM
#>  sessioninfo    1.2.2      2021-12-06 [2] any (@1.2.2)
#>  stringi        1.8.3      2023-12-11 [2] RSPM
#>  stringr        1.5.1      2023-11-14 [2] RSPM
#>  styler         1.10.2     2023-08-29 [2] any (@1.10.2)
#>  survival       3.5-8      2024-02-14 [4] CRAN (R 4.3.3)
#>  systemfonts    1.0.6      2024-03-07 [2] RSPM
#>  text2vec     * 0.6.4      2023-11-09 [2] RSPM
#>  textrecipes  * 1.0.6      2023-11-15 [2] RSPM
#>  textshaping    0.3.7      2023-10-09 [2] RSPM
#>  tibble       * 3.2.1      2023-03-20 [2] RSPM
#>  tidymodels   * 1.1.1      2023-08-24 [2] RSPM
#>  tidyr        * 1.3.1      2024-01-24 [2] RSPM
#>  tidyselect     1.2.1      2024-03-11 [2] RSPM
#>  timechange     0.3.0      2024-01-18 [2] RSPM
#>  timeDate       4032.109   2023-12-14 [2] RSPM
#>  tune         * 1.2.0      2024-03-20 [2] RSPM
#>  tzdb           0.4.0      2023-05-12 [2] RSPM
#>  utf8           1.2.4      2023-10-22 [2] RSPM
#>  vctrs          0.6.5      2023-12-01 [2] RSPM
#>  vroom          1.6.5      2023-12-05 [2] RSPM
#>  withr          3.0.0      2024-01-16 [2] RSPM
#>  workflows    * 1.1.4      2024-02-19 [2] RSPM
#>  workflowsets * 1.1.0      2024-03-21 [2] RSPM
#>  xfun           0.42       2024-02-08 [2] RSPM
#>  xgboost      * 1.7.7.1    2024-01-25 [2] RSPM
#>  yaml           2.3.8      2023-12-11 [2] RSPM
#>  yardstick    * 1.3.1      2024-03-21 [2] RSPM
#> 
#>  [1] /tmp/RtmpheSglV/temp_libpath36517838a50d
#>  [2] /home/runner/work/_temp/Library
#>  [3] /opt/R/4.3.3/lib/R/site-library
#>  [4] /opt/R/4.3.3/lib/R/library
#> 
#> ──────────────────────────────────────────────────────────────────────────────