R 기초; 텍스트
x <- "We have a dream"
nchar(x) # 문자열의 캐릭터 개수
[1] 15
length(x) # 원소의 개수
[1] 1
y <-c ("we", "have", "a", "dream")
y
[1] "we" "have" "a" "dream"
nchar(y)
[1] 2 4 1 5
length(y)
[1] 4
nchar(y[4])
[1] 5
letters
sort(letters, decreasing = TRUE)
[1] "z" "y" "x" "w" "v" "u" "t" "s" "r" "q" "p" "o" "n" "m" "l" "k" "j"
[18] "i" "h" "g" "f" "e" "d" "c" "b" "a"
fox.says <- "It is only with the HEART that one can See Rightly"
tolower(fox.says)
[1] "it is only with the heart that one can see rightly"
toupper(fox.says)
[1] "IT IS ONLY WITH THE HEART THAT ONE CAN SEE RIGHTLY"
fox.said <- "what is essential is invisible to the eye"
strsplit(fox.said, split=" ")
[[1]]
[1] "what" "is" "essential" "is" "invisible"
[6] "to" "the" "eye"
strsplit(fox.said, split="")
[[1]]
[1] "w" "h" "a" "t" " " "i" "s" " " "e" "s" "s" "e" "n" "t" "i" "a" "l"
[18] " " "i" "s" " " "i" "n" "v" "i" "s" "i" "b" "l" "e" " " "t" "o" " "
[35] "t" "h" "e" " " "e" "y" "e"
fox.said.words <- unlist(strsplit(fox.said, split=" "))
fox.said.words
[1] "what" "is" "essential" "is" "invisible"
[6] "to" "the" "eye"
fox.said.words[3]
[1] "essential"
strsplit(fox.said, split=" ")[[1]]
[1] "what" "is" "essential" "is" "invisible"
[6] "to" "the" "eye"
strsplit(fox.said, split=" ")[[1]][[3]]
[1] "essential"
p1 <- "You come at four in the afternoon, then at three I shall begin to be happy"
p2 <- "One runs the risk of weeping a little, if one lets himself be tamed"
p3 <- "what makes the desert beautiful is that somewhere it hides a well"
littleprince <- c(p1, p2, p3)
littleprince
strsplit(littleprince, " ")
[[1]]
[1] "You" "come" "at" "four" "in" "the" "afternoon," "then" "at" "three" "I" "shall"
[13] "begin" "to" "be" "happy"
[[2]]
[1] "One" "runs" "the" "risk" "of" "weeping" "a" "little," "if" "one" "lets" "himself" "be" "tamed"
[[3]]
[1] "what" "makes" "the" "desert" "beautiful" "is" "that" "somewhere" "it" "hides" "a" "well"
unique(fox.said.words)
[1] "what" "is" "essential" "invisible" "to" "the" "eye"
unique(tolower(fox.said.words))
[1] "what" "is" "essential" "invisible" "to" "the" "eye"
# 분리된 벡터를 결합하기 위한 함수; paste
paste("Everybody", "wants", "to", "fly")
[1] "Everybody wants to fly"
paste(c("Everybody", "wants", "to", "fly"))
[1] "Everybody" "wants" "to" "fly"
paste("Everybody", "wants", "to", "fly", sep="-")
[1] "Everybody-wants-to-fly"
paste("Everybody", "wants", "to", "fly", sep="")
[1] "Everybodywantstofly"
paste0("Everybody", "wants", "to", "fly")
[1] "Everybodywantstofly"
paste(pi, sqrt(pi))
[1] "3.14159265358979 1.77245385090552"
paste("25 degrees Celsius is", 25*1.8 + 32, "degree Fahrenheit")
[1] "25 degrees Celsius is 77 degree Fahrenheit"
heroes <- c("Batman", "Captain America", "Hulk")
colors <- c("Black", "Blue", "Green")
paste(heroes, colors)
[1] "Batman Black" "Captain America Blue" "Hulk Green"
paste("Type", 1:5)
[1] "Type 1" "Type 2" "Type 3" "Type 4" "Type 5"
paste(heroes, "wants", "to", "fly")
[1] "Batman wants to fly" "Captain America wants to fly" "Hulk wants to fly"
paste(c("Everybody", "wants", "to", "fly"))
[1] "Everybody" "wants" "to" "fly"
paste(c("Everybody", "wants", "to", "fly"), collapse=" ")
[1] "Everybody wants to fly"
paste(month.abb, 1:12)
[1] "Jan 1" "Feb 2" "Mar 3" "Apr 4" "May 5" "Jun 6" "Jul 7" "Aug 8" "Sep 9" "Oct 10" "Nov 11" "Dec 12"
paste(month.abb, 1:12, sep="_")
[1] "Jan_1" "Feb_2" "Mar_3" "Apr_4" "May_5" "Jun_6" "Jul_7" "Aug_8" "Sep_9" "Oct_10" "Nov_11" "Dec_12"
paste(month.abb, 1:12, sep="_", collapse="-")
[1] "Jan_1-Feb_2-Mar_3-Apr_4-May_5-Jun_6-Jul_7-Aug_8-Sep_9-Oct_10-Nov_11-Dec_12"
# 가능한 모든 곱 계산을 하고 싶을 때는 outer
outer(c(1,2,3), c(1,2,3))
[,1] [,2] [,3]
[1,] 1 2 3
[2,] 2 4 6
[3,] 3 6 9
# 문자 벡터의 outer 사용, FUN=paste
asian.countries <- c("Korea", "Japan", "China")
info <- c("GDP", "Population", "Area")
outer(asian.countries, info, FUN=paste, sep="-")
[,1] [,2] [,3]
[1,] "Korea-GDP" "Korea-Population" "Korea-Area"
[2,] "Japan-GDP" "Japan-Population" "Japan-Area"
[3,] "China-GDP" "China-Population" "China-Area"
out <- outer(asian.countries, asian.countries, FUN=paste, sep="-")
as.vector(out)
[1] "Korea-Korea" "Japan-Korea" "China-Korea" "Korea-Japan" "Japan-Japan" "China-Japan" "Korea-China" "Japan-China" "China-China"
x <- outer(asian.countries, asian.countries, FUN=paste, sep="-")
x[!lower.tri(x)] # 중복 제거
[1] "Korea-Korea" "Korea-Japan" "Japan-Japan" "Korea-China" "Japan-China" "China-China"
customer <- "Jobs"
buysize <- 10
deliveryday <- 3
paste("Hello ", customer, ", your order of ", buysize, " product(s) will be delivered within ", deliveryday, " day(s).", sep="")
[1] "Hello Jobs, your order of 10 product(s) will be delivered within 3 day(s)."
sprintf("Hello %s your order of %s product(s) will be delivered within %s day(s).", customer, buysize, deliveryday)
[1] "Hello Jobs your order of 10 product(s) will be delivered within 3 day(s)."
customer <- c("Jobs", "Gates", "Bezos")
buysize <- c(10, 7, 12)
deliveryday <- c(3, 2, 7.5)
sprintf("Hello %s your order of %s product(s) will be delivered within %s day(s).", customer, buysize, deliveryday)
[1] "Hello Jobs your order of 10 product(s) will be delivered within 3 day(s)."
[2] "Hello Gates your order of 7 product(s) will be delivered within 2 day(s)."
[3] "Hello Bezos your order of 12 product(s) will be delivered within 7.5 day(s)."
# 문자열 슬라이싱
substr("Data Analytics", start=1, stop=4)
[1] "Data"
substr("Data Analytics", start=6, stop=14)
[1] "Analytics"
substring("Data Analytics", 6)
[1] "Analytics"
class <- c("Data Analytics", "Data Mining", "Data Visualization")
substr(class, 1, 4)
[1] "Data" "Data" "Data"
# 국가코드 추출하기 (슬라이싱)
countries <- c("Korea, KR", "United States, US", "China, CN")
substr(countries, nchar(countries)-1, nchar(countries))
[1] "KR" "US" "CN"
head(islands)
landmasses <- names(islands)
landmasses
[1] "Africa" "Antarctica" "Asia" "Australia" "Axel Heiberg" "Baffin"
[7] "Banks" "Borneo" "Britain" "Celebes" "Celon" "Cuba"
[13] "Devon" "Ellesmere" "Europe" "Greenland" "Hainan" "Hispaniola"
[19] "Hokkaido" "Honshu" "Iceland" "Ireland" "Java" "Kyushu"
[25] "Luzon" "Madagascar" "Melville" "Mindanao" "Moluccas" "New Britain"
[31] "New Guinea" "New Zealand (N)" "New Zealand (S)" "Newfoundland" "North America" "Novaya Zemlya"
[37] "Prince of Wales" "Sakhalin" "South America" "Southampton" "Spitsbergen" "Sumatra"
[43] "Taiwan" "Tasmania" "Tierra del Fuego" "Timor" "Vancouver" "Victoria"
grep(pattern="New", x=landmasses)
[1] 30 31 32 33 34
index <- grep(pattern="New", x=landmasses)
landmasses[index]
[1] "New Britain" "New Guinea" "New Zealand (N)" "New Zealand (S)" "Newfoundland"
grep(pattern="New", x=landmasses, value=TRUE)
[1] "New Britain" "New Guinea" "New Zealand (N)" "New Zealand (S)" "Newfoundland"
landmasses[grep(" ", landmasses)]
[1] "Axel Heiberg" "New Britain" "New Guinea" "New Zealand (N)" "New Zealand (S)" "North America"
[7] "Novaya Zemlya" "Prince of Wales" "South America" "Tierra del Fuego"
grep(" ", landmasses, value=TRUE)
[1] "Axel Heiberg" "New Britain" "New Guinea" "New Zealand (N)" "New Zealand (S)" "North America"
[7] "Novaya Zemlya" "Prince of Wales" "South America" "Tierra del Fuego"
txt <- "Data Analytics is useful. Data Analytics is also interesting."
sub(pattern="Data", replacement="Business", x=txt)
[1] "Business Analytics is useful. Data Analytics is also interesting."
gsub(pattern="Data", replacement="Business", x=txt)
[1] "Business Analytics is useful. Business Analytics is also interesting."
x <- c("product.csv", "customer.csv", "supplier.csv")
gsub(".csv", "", x)
[1] "product" "customer" "supplier"
words <- c("at", "bat", "cat", "chaenomeles", "chase", "cheap","check", "cheese", "chick", "hat", "chasse")
grep("che", words, value=TRUE)
[1] "cheap" "check" "cheese"
grep("at", words, value=TRUE)
[1] "at" "bat" "cat" "hat"
# REGEX; Regular Expression 정규식
grep("[ch]", words, value=TRUE) # c 또는 h가 포함된 문자열
[1] "cat" "chaenomeles" "chase" "cheap" "check" "cheese" "chick" "hat" "chasse"
grep("[at]", words, value=TRUE) # a 또는 t가 포함된 문자열
[1] "at" "bat" "cat" "chaenomeles" "chase" "cheap" "hat" "chasse"
# ch 또는 at를 포함하는 문자열 추출
grep("ch|at", words, value=TRUE)
[1] "at" "bat" "cat" "chaenomeles" "chase" "cheap" "check" "cheese" "chick" "hat" "chasse"
# ch로 시작하고 e 또는 i 가 다음에 나오고 ck로 문자열이 끝나는 문자열 추출
grep("ch(e|i)ck", words, value=TRUE)
[1] "check" "chick"
# cha 다음에 s가 있어도 되고 없어도 되지만 다음에 e가 나오는 문자열 추출 (regex에서 물음표(?)는 "있어도 되고 없어도 되고"를 뜻함)
grep("chas?e", words, value=TRUE)
[1] "chaenomeles" "chase"
# Regex에서 *는 0부터 그 이상 반복할 때
# cha다음 s가 1회 이상 가능하고 없어도 되고, 다음에는 e가 나옴
grep("chas*e", words, value=TRUE)
[1] "chaenomeles" "chase" "chasse"
# s가 반드시 존재하는 문자열 추출
# Regex에서 +는 1회 이상을 뜻함
grep("chas+e", words, value=TRUE)
[1] "chase" "chasse"
grep("ch(a*|e*)se", words, value=TRUE)
[1] "chase" "cheese"
# c로 시작하는
grep("^c", words, value=TRUE)
[1] "cat" "chaenomeles" "chase" "cheap" "check" "cheese" "chick" "chasse"
# t로 끝나는
grep("t$", words, value=TRUE)
[1] "at" "bat" "cat" "hat"
# 문자열이 at로 시작하거나, 맨 앞의 h 또는 c 한 글자 뒤에 바로 at가 이어지는 문자열 (at로 끝나는 것까지 강제하려면 "^[hc]?at$")
grep("^[hc]?at", words, value=TRUE)
[1] "at" "cat" "hat"
words2 <- c("12 Dec", "OK", "http://", "<TITLE>Time?</TITLE>", "12345", "Hi there")
grep("[[:alnum:]]", words2, value=TRUE)
[1] "12 Dec" "OK" "http://" "<TITLE>Time?</TITLE>" "12345"
[6] "Hi there"
grep("[[:digit:]]", words2, value=TRUE)
[1] "12 Dec" "12345"
grep("[[:digit:]]", words2, value=TRUE)
[1] "12 Dec" "12345"
grep("[[:alpha:]]", words2, value=TRUE)
[1] "12 Dec" "OK" "http://" "<TITLE>Time?</TITLE>" "Hi there"
grep("[[:punct:]]", words2, value=TRUE)
[1] "http://" "<TITLE>Time?</TITLE>"
grep("[[:space:]]", words2, value=TRUE)
[1] "12 Dec" "Hi there"
grep("\\w+", words2, value=TRUE)
[1] "12 Dec" "OK" "http://" "<TITLE>Time?</TITLE>" "12345"
[6] "Hi there"
'공부 > R Programming' 카테고리의 다른 글
R 기초; 웹 스크레이핑 stringr 패키지 (0) | 2021.01.15 |
---|---|
R 기초; 웹스크레이핑 base 패키지 (0) | 2021.01.14 |
R 기초; 데이터프레임 인덱싱-3 (0) | 2021.01.10 |
R기초; 데이터프레임 인덱싱 - 2 (0) | 2021.01.10 |
R기초; 데이터프레임 인덱싱 - 1 (0) | 2021.01.09 |
댓글