본문 바로가기
공부/R Programming

R 기초; 웹 스크레이핑 stringr 패키지

by 혼밥맨 2021. 1. 15.
반응형

R 기초; 웹 스크레이핑 stringr 패키지

 

string <- c("data analytics is useful", "business analytics is helpful", "visualization of data is interesting for data scientists")

 

install.packages("stringr", repos="http://cran.us.r-project.org")

library(stringr)

 

str_detect(string=string, pattern="data")

[1]  TRUE FALSE  TRUE

 

str_detect(string, "DATA")

[1] FALSE FALSE FALSE

 

str_detect(string, fixed("DATA", ignore_case=TRUE))

[1]  TRUE FALSE  TRUE

 

str_detect(c("abz", "ayz", "a.z"), "a.z")

[1] TRUE TRUE TRUE

 

str_detect(c("abz", "ayz", "a.z"), fixed("a.z"))

[1] FALSE FALSE  TRUE

 

str_detect(c("abz", "ayz", "a.z"), "a\\.z")

[1] FALSE FALSE  TRUE

 

str_locate()

str_locate_all()

 

regexpr()

gregexpr()

 

str_locate(string, "data")

     start end
[1,]     1   4
[2,]    NA  NA
[3,]    18  21

 

str_locate_all(string, "data")

[[1]]
     start end
[1,]     1   4

[[2]]
     start end

[[3]]
     start end
[1,]    18  21
[2,]    42  45

 

str_extract_all(string, "data", simplify=TRUE)

     [,1]   [,2]  
[1,] "data" ""    
[2,] ""     ""    
[3,] "data" "data"

 

unlist(str_extract_all(string, "data"))

[1] "data" "data" "data"

 

str_match()

str_match_all()

 

sentences5 <- sentences[1:5]

sentences5

[1] "The birch canoe slid on the smooth planks."  "Glue the sheet to the dark blue background." "It's easy to tell the depth of a well."     
[4] "These days a chicken leg is a rare dish."    "Rice is often served in round bowls."

 

str_extract(sentences5, "(a|A|the|The) (\\w+)")

[1] "The birch" "the sheet" "the depth" "a chicken" NA 

 

str_replace_all(string=string, pattern="data", replacement="text")

[1] "text analytics is useful"                                 "business analytics is helpful"                           
[3] "visualization of text is interesting for text scientists"

 

str_split(string, " ")

[[1]]
[1] "data"      "analytics" "is"        "useful"   

[[2]]
[1] "business"  "analytics" "is"        "helpful"  

[[3]]
[1] "visualization" "of"            "data"          "is"            "interesting"   "for"           "data"          "scientists"   

 

unlist(str_split(string, " "))

[1] "data"          "analytics"     "is"            "useful"        "business"      "analytics"     "is"            "helpful"       "visualization"
[10] "of"            "data"          "is"            "interesting"   "for"           "data"          "scientists"   

 

unique(unlist(str_split(string, " ")))

 [1] "data"          "analytics"     "is"            "useful"        "business"      "helpful"       "visualization" "of"            "interesting"  
[10] "for"           "scientists"   

 

str_split(string, " ", n=3)

[[1]]
[1] "data"      "analytics" "is useful"

[[2]]
[1] "business"   "analytics"  "is helpful"

[[3]]
[1] "visualization"                           "of"                                      "data is interesting for data scientists"

 

 

str_split(string, " ", n=3, simplify=TRUE)

     [,1]            [,2]        [,3]                                     
[1,] "data"          "analytics" "is useful"                              
[2,] "business"      "analytics" "is helpful"                             
[3,] "visualization" "of"        "data is interesting for data scientists"

 

 

str_length(string)

[1] 24 29 56

str_count(string, "data")

[1] 1 0 2

str_count(string, "\\w+")

[1] 4 4 8

 

str_pad(string=c("a", "abc", "abcde"), width=6, side="left", pad=" ")

[1] "     a" "   abc" " abcde"

mon <- 1:12

mon

 [1]  1  2  3  4  5  6  7  8  9 10 11 12

str_pad(mon, width=2, side="left", pad="0")

[1] "01" "02" "03" "04" "05" "06" "07" "08" "09" "10" "11" "12"

 

 

str_trim()

str.pad <- str_pad(string, width=max(str_length(string)), side="both", pad=" ")

str.pad

[1] "                data analytics is useful                " "             business analytics is helpful              "
[3] "visualization of data is interesting for data scientists"

 

str_trim(str.pad, side="both")

[1] "data analytics is useful"                                 "business analytics is helpful"                           
[3] "visualization of data is interesting for data scientists"

 

str_c("data", "mining", sep=" ")

[1] "data mining"

 

str.mining <- str_c(c("data mining", "text mining"), "is useful", sep=" ")

 

str.mining

[1] "data mining is useful" "text mining is useful"

str_c(str.mining, collapse="; ")

[1] "data mining is useful; text mining is useful"

 

str_c(str.mining, collapse="\n")

[1] "data mining is useful\ntext mining is useful"

 

cat(str_c(str.mining, collapse="\n"))

data mining is useful
text mining is useful

 

str_sub(string=str.mining, start=1, end=4)

[1] "data" "text"

 

str_sub(str.mining, 5, 5) <- "-"

str.mining

[1] "data-mining is useful" "text-mining is useful"

 

 

str_sub("abcedfg", start=-2)

[1] "fg"

str_sub("abcedfg", end=-3)

[1] "abced"

반응형

'공부 > R Programming' 카테고리의 다른 글

R 기초; 입력  (0) 2021.01.15
R 기초; 날짜와 시간  (0) 2021.01.15
R 기초; 웹스크레이핑 base 패키지  (0) 2021.01.14
R 기초; 텍스트  (0) 2021.01.13
R 기초; 데이터프레임 인덱싱-3  (0) 2021.01.10

댓글