R 기초; 텍스트

x <- "We have a dream"

nchar(x) # 문자열의 캐릭터 개수

[1] 15

length(x) # 원소의 개수

[1] 1

y <-c ("we", "have", "a", "dream")

[1] "we" "have" "a" "dream"

nchar(y)

[1] 2 4 1 5

length(y)

[1] 4

nchar(y[4])

[1] 5

letters

sort(letters, decreasing = TRUE)

[1] "z" "y" "x" "w" "v" "u" "t" "s" "r" "q" "p" "o" "n" "m" "l" "k" "j"
[18] "i" "h" "g" "f" "e" "d" "c" "b" "a"

fox.says <- "It is only with the HEART that one can See Rightly"

tolower(fox.says)

[1] "it is only with the heart that one can see rightly"

toupper(fox.says)

[1] "IT IS ONLY WITH THE HEART THAT ONE CAN SEE RIGHTLY"

fox.said <- "what is essential is invisible to the eye"

strsplit(fox.said, split=" ")

[[1]]
[1] "what" "is" "essential" "is" "invisible"
[6] "to" "the" "eye"

strsplit(fox.said, split="")

[[1]]
[1] "w" "h" "a" "t" " " "i" "s" " " "e" "s" "s" "e" "n" "t" "i" "a" "l"
[18] " " "i" "s" " " "i" "n" "v" "i" "s" "i" "b" "l" "e" " " "t" "o" " "
[35] "t" "h" "e" " " "e" "y" "e"

fox.said.words <- unlist(strsplit(fox.said, split=" "))

fox.said.words

[1] "what" "is" "essential" "is" "invisible"
[6] "to" "the" "eye"

fox.said.words[3]

[1] "essential"

strsplit(fox.said, split=" ")[[1]]

[1] "what" "is" "essential" "is" "invisible"
[6] "to" "the" "eye"

strsplit(fox.said, split=" ")[[1]][[3]]

[1] "essential"

p1 <- "You come at four in the afternoon, then at three I shall begin to be happy"

p2 <- "One runs the risk of weeping a little, if one lets himself be tamed"

p3 <- "what makes the desert beautiful is that somewhere it hides a well"

littleprince <- c(p1, p2, p3)

littleprince

strsplit(littleprince, " ")

[[1]]
[1] "You"        "come"       "at"         "four"       "in"         "the"        "afternoon," "then"       "at"         "three"      "I"          "shall"
[13] "begin"      "to"         "be"         "happy"

[[2]]
[1] "One"     "runs"    "the"     "risk"    "of"      "weeping" "a"       "little," "if"      "one"     "lets"    "himself" "be"      "tamed"

[[3]]
[1] "what"      "makes"     "the"       "desert"    "beautiful" "is"        "that"      "somewhere" "it"        "hides"     "a"         "well"

unique(fox.said.words)

[1] "what" "is" "essential" "invisible" "to" "the" "eye"

unique(tolower(fox.said.words))

[1] "what" "is" "essential" "invisible" "to" "the" "eye"

# 분리된 벡터를 결합하기 위한 함수; paste

paste("Everybody", "wants", "to", "fly")

[1] "Everybody wants to fly"

paste(c("Everybody", "wants", "to", "fly"))

[1] "Everybody" "wants" "to" "fly"

paste("Everybody", "wants", "to", "fly", sep="-")

[1] "Everybody-wants-to-fly"

paste("Everybody", "wants", "to", "fly", sep="")

[1] "Everybodywantstofly"

paste0("Everybody", "wants", "to", "fly")

[1] "Everybodywantstofly"

paste(pi, sqrt(pi))

[1] "3.14159265358979 1.77245385090552"

paste("25 degrees Celsius is", 25*1.8 + 32, "degree Fahrenheit")

[1] "25 degrees Celsius is 77 degree Fahrenheit"

heroes <- c("Batman", "Captain America", "Hulk")

colors <- c("Black", "Blue", "Green")

paste(heroes, colors)

[1] "Batman Black" "Captain America Blue" "Hulk Green"

paste("Type", 1:5)

[1] "Type 1" "Type 2" "Type 3" "Type 4" "Type 5"

paste(heroes, "wants", "to", "fly")

[1] "Batman wants to fly" "Captain America wants to fly" "Hulk wants to fly"

paste(c("Everybody", "wants", "to", "fly"))

[1] "Everybody" "wants" "to" "fly"

paste(c("Everybody", "wants", "to", "fly"), collapse=" ")

[1] "Everybody wants to fly"

paste(month.abb, 1:12)

[1] "Jan 1" "Feb 2" "Mar 3" "Apr 4" "May 5" "Jun 6" "Jul 7" "Aug 8" "Sep 9" "Oct 10" "Nov 11" "Dec 12"

paste(month.abb, 1:12, sep="_")

[1] "Jan_1" "Feb_2" "Mar_3" "Apr_4" "May_5" "Jun_6" "Jul_7" "Aug_8" "Sep_9" "Oct_10" "Nov_11" "Dec_12"

paste(month.abb, 1:12, sep="_", collapse="-")

[1] "Jan_1-Feb_2-Mar_3-Apr_4-May_5-Jun_6-Jul_7-Aug_8-Sep_9-Oct_10-Nov_11-Dec_12"

# 가능한 모든 곱 계산을 하고 싶을 때는 outer

outer(c(1,2,3), c(1,2,3))

     [,1] [,2] [,3]
[1,]    1    2    3
[2,]    2    4    6
[3,]    3    6    9

# 문자 벡터의 outer 사용, FUN=paste

asian.countries <- c("Korea", "Japan", "China")

info <- c("GDP", "Population", "Area")

outer(asian.countries, info, FUN=paste, sep="-")

[,1] [,2] [,3]
[1,] "Korea-GDP" "Korea-Population" "Korea-Area"
[2,] "Japan-GDP" "Japan-Population" "Japan-Area"
[3,] "China-GDP" "China-Population" "China-Area"

out <- outer(asian.countries, asian.countries, FUN=paste, sep="-")

as.vector(out)

[1] "Korea-Korea" "Japan-Korea" "China-Korea" "Korea-Japan" "Japan-Japan" "China-Japan" "Korea-China" "Japan-China" "China-China"

x <- outer(asian.countries, asian.countries, FUN=paste, sep="-")

x[!lower.tri(x)] # 중복 제거

[1] "Korea-Korea" "Korea-Japan" "Japan-Japan" "Korea-China" "Japan-China" "China-China"

customer <- "Jobs"

buysize <- 10

deliveryday <- 3

paste("Hello ", customer, ", your order of ", buysize, " product(s) will be delivered within ", deliveryday, " day(s).", sep="")

[1] "Hello Jobs, your order of 10 product(s) will be delivered within 3 day(s)."

sprintf("Hello %s your order of %s product(s) will be delivered within %s day(s).", customer, buysize, deliveryday)

[1] "Hello Jobs your order of 10 product(s) will be delivered within 3 day(s)."

customer <- c("Jobs", "Gates", "Bezos")

buysize <- c(10, 7, 12)

deliveryday <- c(3, 2, 7.5)

sprintf("Hello %s your order of %s product(s) will be delivered within %s day(s).", customer, buysize, deliveryday)

[1] "Hello Jobs your order of 10 product(s) will be delivered within 3 day(s)."
[2] "Hello Gates your order of 7 product(s) will be delivered within 2 day(s)."
[3] "Hello Bezos your order of 12 product(s) will be delivered within 7.5 day(s)."

# 문자열 슬라이싱

substr("Data Analytics", start=1, stop=4)

[1] "Data"

substr("Data Analytics", start=6, stop=14)

[1] "Analytics"

substring("Data Analytics", 6)

[1] "Analytics"

class <- c("Data Analytics", "Data Mining", "Data Visualization")

substr(class, 1, 4)

[1] "Data" "Data" "Data"

# 국가코드 추출하기 (슬라이싱)

countries <- c("Korea, KR", "United States, US", "China, CN")

substr(countries, nchar(countries)-1, nchar(countries))

[1] "KR" "US" "CN"

head(islands)

landmasses <- names(islands)

landmasses

[1] "Africa"           "Antarctica"       "Asia"             "Australia"        "Axel Heiberg"     "Baffin"
[7] "Banks"            "Borneo"           "Britain"          "Celebes"          "Celon"            "Cuba"
[13] "Devon"            "Ellesmere"        "Europe"           "Greenland"        "Hainan"           "Hispaniola"
[19] "Hokkaido"         "Honshu"           "Iceland"          "Ireland"          "Java"             "Kyushu"
[25] "Luzon"            "Madagascar"       "Melville"         "Mindanao"         "Moluccas"         "New Britain"
[31] "New Guinea"       "New Zealand (N)"  "New Zealand (S)"  "Newfoundland"     "North America"    "Novaya Zemlya"
[37] "Prince of Wales"  "Sakhalin"         "South America"    "Southampton"      "Spitsbergen"      "Sumatra"
[43] "Taiwan"           "Tasmania"         "Tierra del Fuego" "Timor"            "Vancouver"        "Victoria"

grep(pattern="New", x=landmasses)

[1] 30 31 32 33 34

index <- grep(pattern="New", x=landmasses)

landmasses[index]

[1] "New Britain" "New Guinea" "New Zealand (N)" "New Zealand (S)" "Newfoundland"

grep(pattern="New", x=landmasses, value=TRUE)

[1] "New Britain" "New Guinea" "New Zealand (N)" "New Zealand (S)" "Newfoundland"

landmasses[grep(" ", landmasses)]

[1] "Axel Heiberg" "New Britain" "New Guinea" "New Zealand (N)" "New Zealand (S)" "North America"
[7] "Novaya Zemlya" "Prince of Wales" "South America" "Tierra del Fuego"

grep(" ", landmasses, value=TRUE)

[1] "Axel Heiberg" "New Britain" "New Guinea" "New Zealand (N)" "New Zealand (S)" "North America"
[7] "Novaya Zemlya" "Prince of Wales" "South America" "Tierra del Fuego"

txt <- "Data Analytics is useful. Data Analytics is also interesting."

sub(pattern="Data", replacement="Business", x=txt)

[1] "Business Analytics is useful. Data Analytics is also interesting."

gsub(pattern="Data", replacement="Business", x=txt)

[1] "Business Analytics is useful. Business Analytics is also interesting."

x <- c("product.csv", "customer.csv", "supplier.csv")

gsub(".csv", "", x)

[1] "product" "customer" "supplier"

words <- c("at", "bat", "cat", "chaenomeles", "chase", "cheap","check", "cheese", "chick", "hat", "chasse")

grep("che", words, value=TRUE)

[1] "cheap" "check" "cheese"

grep("at", words, value=TRUE)

[1] "at" "bat" "cat" "hat"

# REGEX; Regular Expression 정규식

grep("[ch]", words, value=TRUE) # c 또는 h가 포함된 문자열

[1] "cat" "chaenomeles" "chase" "cheap" "check" "cheese" "chick" "hat" "chasse"

grep("[at]", words, value=TRUE) # a 또는 t가 포함된 문자열

[1] "at" "bat" "cat" "chaenomeles" "chase" "cheap" "hat" "chasse"

# ch 또는 at를 포함하는 문자열 추출

grep("ch|at", words, value=TRUE)

[1] "at" "bat" "cat" "chaenomeles" "chase" "cheap" "check" "cheese" "chick" "hat" "chasse"

# ch로 시작하고 e 또는 i 가 다음에 나오고 ck로 문자열이 끝나는 문자열 추출

grep("ch(e|i)ck", words, value=TRUE)

[1] "check" "chick"

# cha 다음에 s가 있어도 되고 없어도 되지만 다음에 e가 나오는 문자열 추출 (regex에서 물음표(?)는 "있어도 되고 없어도 되고"를 뜻함)

grep("chas?e", words, value=TRUE)

[1] "chaenomeles" "chase"

# Regex에서 *는 0부터 그 이상 반복할 때

# cha다음 s가 1회 이상 가능하고 없어도 되고, 다음에는 e가 나옴

grep("chas*e", words, value=TRUE)

[1] "chaenomeles" "chase" "chasse"

# s가 반드시 존재하는 문자열 추출

# Regex에서 +는 1회 이상을 뜻함

grep("chas+e", words, value=TRUE)

[1] "chase" "chasse"

grep("ch(a*|e*)se", words, value=TRUE)

[1] "chase" "cheese"

# 소문자 c로 시작하는 (grep은 기본적으로 대소문자를 구분함)

grep("^c", words, value=TRUE)

[1] "cat" "chaenomeles" "chase" "cheap" "check" "cheese" "chick" "chasse"

# t로 끝나는

grep("t$", words, value=TRUE)

[1] "at" "bat" "cat" "hat"

# 맨 앞에 h 또는 c가 있어도 되고 없어도 되며, 바로 뒤에 at가 이어지는 문자열 (즉 at, hat, cat으로 시작하는 문자열; 반드시 at로 끝나게 하려면 "^[hc]?at$" 사용)

grep("^[hc]?at", words, value=TRUE)

[1] "at" "cat" "hat"

words2 <- c("12 Dec", "OK", "http://", "<TITLE>Time?</TITLE>", "12345", "Hi there")

grep("[[:alnum:]]", words2, value=TRUE)

[1] "12 Dec" "OK" "http://" "<TITLE>Time?</TITLE>" "12345"
[6] "Hi there"

grep("[[:digit:]]", words2, value=TRUE)

[1] "12 Dec" "12345"

grep("[[:digit:]]", words2, value=TRUE)

[1] "12 Dec" "12345"

grep("[[:alpha:]]", words2, value=TRUE)

[1] "12 Dec" "OK" "http://" "<TITLE>Time?</TITLE>" "Hi there"

grep("[[:punct:]]", words2, value=TRUE)

[1] "http://" "<TITLE>Time?</TITLE>"

grep("[[:space:]]", words2, value=TRUE)

[1] "12 Dec" "Hi there"

grep("\\w+", words2, value=TRUE)

[1] "12 Dec" "OK" "http://" "<TITLE>Time?</TITLE>" "12345"
[6] "Hi there"

'공부 > R Programming' 카테고리의 다른 글

R 기초; 웹 스크레이핑 stringr 패키지 (0)	2021.01.15
R 기초; 웹스크레이핑 base 패키지 (0)	2021.01.14
R 기초; 데이터프레임 인덱싱-3 (0)	2021.01.10
R기초; 데이터프레임 인덱싱 - 2 (0)	2021.01.10
R기초; 데이터프레임 인덱싱 - 1 (0)	2021.01.09

혼밥맨

R 기초; 텍스트