본문 바로가기
공부/R Programming

R 기초; 텍스트

by 혼밥맨 2021. 1. 13.
반응형

R 기초; 텍스트

 

x <- "We have a dream"

nchar(x)    # 문자열의 캐릭터 개수

[1] 15

 

length(x)    # 원소의 개수

[1] 1

 

y <-c ("we", "have", "a", "dream")

y

[1] "we" "have" "a" "dream"

 

nchar(y)

[1] 2 4 1 5

 

length(y)

[1] 4

 

nchar(y[4])

[1] 5

 

letters

sort(letters, decreasing = TRUE)

[1] "z" "y" "x" "w" "v" "u" "t" "s" "r" "q" "p" "o" "n" "m" "l" "k" "j"
[18] "i" "h" "g" "f" "e" "d" "c" "b" "a"

 

fox.says <- "It is only with the HEART that one can See Rightly"

tolower(fox.says)

[1] "it is only with the heart that one can see rightly"

toupper(fox.says)

[1] "IT IS ONLY WITH THE HEART THAT ONE CAN SEE RIGHTLY"

 

fox.said <- "what is essential is invisible to the eye"

strsplit(fox.said, split=" ")

[[1]]
[1] "what"      "is"        "essential" "is"        "invisible"
[6] "to"        "the"       "eye" 

strsplit(fox.said, split="")

[[1]]
 [1] "w" "h" "a" "t" " " "i" "s" " " "e" "s" "s" "e" "n" "t" "i" "a" "l"
[18] " " "i" "s" " " "i" "n" "v" "i" "s" "i" "b" "l" "e" " " "t" "o" " "
[35] "t" "h" "e" " " "e" "y" "e"

 

fox.said.words <- unlist(strsplit(fox.said, split=" "))

fox.said.words

[1] "what"      "is"        "essential" "is"        "invisible"
[6] "to"        "the"       "eye" 

fox.said.words[3]

[1] "essential"

 

strsplit(fox.said, split=" ")[[1]]

[1] "what"      "is"        "essential" "is"        "invisible"
[6] "to"        "the"       "eye" 

 

strsplit(fox.said, split=" ")[[1]][[3]]

[1] "essential"

 

p1 <- "You come at four in the afternoon, then at three I shall begin to be happy"

p2 <- "One runs the risk of weeping a little, if one lets himself be tamed"

p3 <- "what makes the desert beautiful is that somewhere it hides a well"

 

littleprince <- c(p1, p2, p3)

littleprince

strsplit(littleprince, " ")

[[1]]
 [1] "You"        "come"       "at"         "four"       "in"         "the"        "afternoon," "then"       "at"         "three"      "I"          "shall"     
[13] "begin"      "to"         "be"         "happy"     

[[2]]
 [1] "One"     "runs"    "the"     "risk"    "of"      "weeping" "a"       "little," "if"      "one"     "lets"    "himself" "be"      "tamed"  

[[3]]
 [1] "what"      "makes"     "the"       "desert"    "beautiful" "is"        "that"      "somewhere" "it"        "hides"     "a"         "well"   

 

unique(fox.said.words)

[1] "what"      "is"        "essential" "invisible" "to"        "the"       "eye"

unique(tolower(fox.said.words))

[1] "what"      "is"        "essential" "invisible" "to"        "the"       "eye"   

 

# 분리된 벡터를 결합하기 위한 함수; paste

paste("Everybody", "wants", "to", "fly")

[1] "Everybody wants to fly"

 

paste(c("Everybody", "wants", "to", "fly"))

[1] "Everybody" "wants"     "to"        "fly" 

 

paste("Everybody", "wants", "to", "fly", sep="-")

[1] "Everybody-wants-to-fly"

paste("Everybody", "wants", "to", "fly", sep="")

[1] "Everybodywantstofly"

paste0("Everybody", "wants", "to", "fly")

[1] "Everybodywantstofly"

 

paste(pi, sqrt(pi))

[1] "3.14159265358979 1.77245385090552"

paste("25 degrees Celsius is", 25*1.8 + 32, "degree Fahrenheit")

[1] "25 degrees Celsius is 77 degree Fahrenheit"

 

heroes <- c("Batman", "Captain America", "Hulk")

colors <- c("Black", "Blue", "Green")

paste(heroes, colors)

[1] "Batman Black"         "Captain America Blue" "Hulk Green"  

 

paste("Type", 1:5)

[1] "Type 1" "Type 2" "Type 3" "Type 4" "Type 5"

 

paste(heroes, "wants", "to", "fly")

[1] "Batman wants to fly"    "Captain America wants to fly"     "Hulk wants to fly"  

 

paste(c("Everybody", "wants", "to", "fly"))

[1] "Everybody" "wants"     "to"        "fly"  

 

paste(c("Everybody", "wants", "to", "fly"), collapse=" ")

[1] "Everybody wants to fly"

 

 

paste(month.abb, 1:12)

 [1] "Jan 1"  "Feb 2"  "Mar 3"  "Apr 4"  "May 5"  "Jun 6"  "Jul 7"  "Aug 8"  "Sep 9"  "Oct 10" "Nov 11" "Dec 12"

paste(month.abb, 1:12, sep="_")

[1] "Jan_1"  "Feb_2"  "Mar_3"  "Apr_4"  "May_5"  "Jun_6"  "Jul_7"  "Aug_8"  "Sep_9"  "Oct_10" "Nov_11" "Dec_12"

paste(month.abb, 1:12, sep="_", collapse="-")

[1] "Jan_1-Feb_2-Mar_3-Apr_4-May_5-Jun_6-Jul_7-Aug_8-Sep_9-Oct_10-Nov_11-Dec_12"

 

 

# 가능한 모든 곱 계산을 하고 싶을 때는 outer

outer(c(1,2,3), c(1,2,3))

     [,1] [,2] [,3]
[1,]    1    2    3
[2,]    2    4    6
[3,]    3    6    9

 

 

# 문자 벡터의 outer 사용, FUN=paste

asian.countries <- c("Korea", "Japan", "China")

info <- c("GDP", "Population", "Area")

outer(asian.countries, info, FUN=paste, sep="-")

     [,1]        [,2]               [,3]        
[1,] "Korea-GDP" "Korea-Population" "Korea-Area"
[2,] "Japan-GDP" "Japan-Population" "Japan-Area"
[3,] "China-GDP" "China-Population" "China-Area"

 

out <- outer(asian.countries, asian.countries, FUN=paste, sep="-")

as.vector(out)

[1] "Korea-Korea" "Japan-Korea" "China-Korea" "Korea-Japan" "Japan-Japan" "China-Japan" "Korea-China" "Japan-China" "China-China"

 

x <- outer(asian.countries, asian.countries, FUN=paste, sep="-")

x[!lower.tri(x)]             # 중복 제거

[1] "Korea-Korea" "Korea-Japan" "Japan-Japan" "Korea-China" "Japan-China" "China-China"

 

 

customer <- "Jobs"

buysize <- 10

deliveryday <- 3

paste("Hello ", customer, ", your order of ", buysize, " product(s) will be delivered within ", deliveryday, " day(s).", sep="")

[1] "Hello Jobs, your order of 10 product(s) will be delivered within 3 day(s)."

 

sprintf("Hello %s your order of %s product(s) will be delivered within %s day(s).", customer, buysize, deliveryday)

[1] "Hello Jobs your order of 10 product(s) will be delivered within 3 day(s)."

 

customer <- c("Jobs", "Gates", "Bezos")

buysize <- c(10, 7, 12)

deliveryday <- c(3, 2, 7.5)

sprintf("Hello %s your order of %s product(s) will be delivered within %s day(s).", customer, buysize, deliveryday)

[1] "Hello Jobs your order of 10 product(s) will be delivered within 3 day(s)."   
[2] "Hello Gates your order of 7 product(s) will be delivered within 2 day(s)."   
[3] "Hello Bezos your order of 12 product(s) will be delivered within 7.5 day(s)."

 

# 문자열 슬라이싱

substr("Data Analytics", start=1, stop=4)

[1] "Data"

substr("Data Analytics", start=6, stop=14)

[1] "Analytics"

substring("Data Analytics", 6)

[1] "Analytics"

 

class <- c("Data Analytics", "Data Mining", "Data Visualization")

substr(class, 1, 4)

[1] "Data" "Data" "Data"

 

# 국가코드 추출하기 (슬라이싱)

countries <- c("Korea, KR", "United States, US", "China, CN")

substr(countries, nchar(countries)-1, nchar(countries))

[1] "KR" "US" "CN"

 

 

head(islands)

landmasses <- names(islands)

landmasses

 [1] "Africa"           "Antarctica"       "Asia"             "Australia"        "Axel Heiberg"     "Baffin"          
 [7] "Banks"            "Borneo"           "Britain"          "Celebes"          "Celon"            "Cuba"            
[13] "Devon"            "Ellesmere"        "Europe"           "Greenland"        "Hainan"           "Hispaniola"      
[19] "Hokkaido"         "Honshu"           "Iceland"          "Ireland"          "Java"             "Kyushu"          
[25] "Luzon"            "Madagascar"       "Melville"         "Mindanao"         "Moluccas"         "New Britain"     
[31] "New Guinea"       "New Zealand (N)"  "New Zealand (S)"  "Newfoundland"     "North America"    "Novaya Zemlya"   
[37] "Prince of Wales"  "Sakhalin"         "South America"    "Southampton"      "Spitsbergen"      "Sumatra"         
[43] "Taiwan"           "Tasmania"         "Tierra del Fuego" "Timor"            "Vancouver"        "Victoria" 

 

grep(pattern="New", x=landmasses)

[1] 30 31 32 33 34

 

index <- grep(pattern="New", x=landmasses)

landmasses[index]

[1] "New Britain"     "New Guinea"      "New Zealand (N)" "New Zealand (S)" "Newfoundland"  

grep(pattern="New", x=landmasses, value=TRUE)

[1] "New Britain"     "New Guinea"      "New Zealand (N)" "New Zealand (S)" "Newfoundland" 

 

landmasses[grep(" ", landmasses)]

 [1] "Axel Heiberg"     "New Britain"      "New Guinea"       "New Zealand (N)"  "New Zealand (S)"  "North America"  
 [7] "Novaya Zemlya"    "Prince of Wales"  "South America"    "Tierra del Fuego"

grep(" ", landmasses, value=TRUE)

 [1] "Axel Heiberg"     "New Britain"      "New Guinea"       "New Zealand (N)"  "New Zealand (S)"  "North America"  
 [7] "Novaya Zemlya"    "Prince of Wales"  "South America"    "Tierra del Fuego"

 

txt <- "Data Analytics is useful. Data Analytics is also interesting."

sub(pattern="Data", replacement="Business", x=txt)

[1] "Business Analytics is useful. Data Analytics is also interesting."

gsub(pattern="Data", replacement="Business", x=txt)

[1] "Business Analytics is useful. Business Analytics is also interesting."

 

x <- c("product.csv", "customer.csv", "supplier.csv")

gsub("\\.csv$", "", x)    # 정규식에서 .은 임의의 한 문자를 뜻하므로 \\.로 이스케이프하고, $로 문자열 끝에 고정

[1] "product"  "customer" "supplier"

 

 

words <- c("at", "bat", "cat", "chaenomeles", "chase", "cheap","check", "cheese", "chick", "hat", "chasse")

 

grep("che", words, value=TRUE)

[1] "cheap"  "check"  "cheese"

grep("at", words, value=TRUE)

[1] "at"  "bat" "cat" "hat"

 

# REGEX; Regular Expression 정규식

grep("[ch]", words, value=TRUE)   # c 또는 h가 포함된 문자열

[1] "cat"         "chaenomeles" "chase"       "cheap"       "check"       "cheese"      "chick"       "hat"         "chasse"

grep("[at]", words, value=TRUE)    # a 또는 t가 포함된 문자열

[1] "at"          "bat"         "cat"         "chaenomeles" "chase"       "cheap"       "hat"         "chasse"

 

 

# ch 또는 at를 포함하는 문자열 추출

grep("ch|at", words, value=TRUE)

[1] "at"          "bat"         "cat"         "chaenomeles" "chase"       "cheap"       "check"       "cheese"      "chick"       "hat"         "chasse"

 

# ch 다음에 e 또는 i가 나오고 바로 이어서 ck가 나오는 부분을 포함하는 문자열 추출 (^, $가 없으므로 위치는 문자열 어디든 상관없음)

grep("ch(e|i)ck", words, value=TRUE)

[1] "check" "chick"

 

# cha 다음에 s가 있어도 되고 없어도 되지만 다음에 e가 나오는 문자열 추출 (regex에서 물음표(?)는 "있어도 되고 없어도 되고"를 뜻함)

grep("chas?e", words, value=TRUE)

[1] "chaenomeles" "chase"  

 

# Regex에서 *는 0부터 그 이상 반복할 때

# cha다음 s가 1회 이상 가능하고 없어도 되고, 다음에는 e가 나옴

grep("chas*e", words, value=TRUE)

[1] "chaenomeles" "chase"       "chasse"

 

# s가 반드시 존재하는 문자열 추출

# Regex에서 +는 1회 이상을 뜻함

grep("chas+e", words, value=TRUE)

[1] "chase"  "chasse"

 

 

grep("ch(a*|e*)se", words, value=TRUE)

[1] "chase"  "cheese"

 

# c로 시작하는

grep("^c", words, value=TRUE)

[1] "cat"         "chaenomeles" "chase"       "cheap"       "check"       "cheese"      "chick"       "chasse"

 

# t로 끝나는

grep("t$", words, value=TRUE)

[1] "at"  "bat" "cat" "hat"

 

# 문자열 맨 앞에 h 또는 c가 있어도 되고 없어도 되며, 바로 이어서 at가 나오는 문자열 (반드시 at로 끝나게 하려면 "^[hc]?at$" 사용)

grep("^[hc]?at", words, value=TRUE)

[1] "at"  "cat" "hat"

 

 

words2 <- c("12 Dec", "OK", "http://", "<TITLE>Time?</TITLE>", "12345", "Hi there")

grep("[[:alnum:]]", words2, value=TRUE)

[1] "12 Dec"               "OK"                   "http://"              "<TITLE>Time?</TITLE>" "12345"               
[6] "Hi there"

 

grep("[[:digit:]]", words2, value=TRUE)

[1] "12 Dec" "12345"

 

grep("[[:digit:]]", words2, value=TRUE)

[1] "12 Dec" "12345" 

grep("[[:alpha:]]", words2, value=TRUE)

[1] "12 Dec"               "OK"                   "http://"              "<TITLE>Time?</TITLE>" "Hi there"

grep("[[:punct:]]", words2, value=TRUE)

[1] "http://"              "<TITLE>Time?</TITLE>"

grep("[[:space:]]", words2, value=TRUE)

[1] "12 Dec"   "Hi there"

grep("\\w+", words2, value=TRUE)

[1] "12 Dec"               "OK"                   "http://"              "<TITLE>Time?</TITLE>" "12345"               
[6] "Hi there"          

반응형

댓글