September 28, 2018
bit.ly/0928pttR
減輕抓取 PTT 資料的負擔
符合 PTT 的斷詞處理
與 R Text Mining 套件銜接
# Install pttR from GitHub, using the "build" ref of the repository.
devtools::install_github(repo = "liao961120/pttR", ref = "build")
library(dplyr)

# Fetch data: build an index data frame for the "gossiping" board, then turn
# the first five links into URLs and download those posts into a data frame.
idx_df <- pttR::index2df("gossiping", newest = 1)
pst_df <- idx_df$link[1:5] %>%
  pttR::as_url() %>%
  pttR::post2df()

# Word segmentation of the post content and of the comments.
pst_df_segged <- pst_df %>%
  mutate(
    content = pttR::seg_content(content),
    comment = pttR::seg_comment(comment)
  )

# Comments of the first post.
pst_df_segged$comment[[1]]

# Construct corpus objects.
post_qcorp <- pttR::post2qcorp(pst_df_segged)    # Corpus object
cmt_qcorp <- pttR::comment2qcorp(pst_df_segged)  # Corpus list-col in df
index2df() \(\equiv\) www.ptt.cc/bbs/看板名稱/index.html
post2df() \(\equiv\) www.ptt.cc/bbs/看板名稱/xx..xx.html
Demo
dplyr::mutate()
+ pttR::seg_content()
+ pttR::seg_comment()
(Demo)