資料處理技巧(2)_R語言－風中一匹狼

# Data-wrangling techniques (2)
# Install only the packages that are not already available, so re-running the
# script does not reinstall everything each time. Per the original note,
# magrittr, tidyr and dplyr are all included in the tidyverse.
required_pkgs <- c("magrittr", "tidyr", "dplyr", "tidyverse")
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
library(tidyverse)
# Pipe `cars` into summary(); equivalent to summary(cars).
cars %>%
  summary()

# Chaining steps with the %>% operator: take today's date, keep only the
# year, convert it to a number, then subtract the birth year.
birth <- 1995
age <- Sys.Date() %>%
  format("%Y") %>%
  as.numeric() %>%
  `-`(birth)

# Controlling where the piped value is inserted.
# Conventional call:
cars_lm <- lm(formula = dist ~ speed, data = cars)
# Piped call: `.` marks the argument that receives the left-hand side.
cars_lm <- cars %>%
  lm(formula = dist ~ speed, data = .)

# gather() stacks several numeric columns into a single value column and
# records, in a key column, which original column each value came from.
# (tidyr's pivot_longer() is the current replacement; gather() is kept here
# because it is the function being demonstrated.)
team_name <- c("Bull", "Warrior")
wins <- c(72, 73)
losses <- c(10, 9)
team <- data.frame(team_name = team_name, wins = wins, losses = losses)
print(team)
team %>%
  gather(key = variable, value = values, wins, losses)

# filter() keeps the rows that satisfy a condition.
filter(team, team_name == "Bull")
# Base R equivalent. The condition reads the column through `team$` so it
# does not depend on a separate global vector named `team_name` staying in
# sync with the data frame.
team[team$team_name == "Bull", ]

# select() keeps specific columns.
select(team, wins)
# Base R equivalent; drop = FALSE keeps the result a data frame instead of
# collapsing it to a vector. FALSE is spelled out because `F` is an ordinary
# variable that can be reassigned.
team[, "wins", drop = FALSE]

# mutate() adds new columns, either derived from existing columns or
# supplied from outside the data frame.
season <- c("1995-96", "2015-16")
team %>%
  mutate(
    winning_percentage = wins / (wins + losses),
    season = season
  )

# arrange() sorts the observations by the given column.
team %>% arrange(losses) # ascending
team %>% arrange(desc(losses)) # descending

# summarise() collapses a column's observations into one computed value.
team %>% summarise(var(losses))
# group_by() combined with summarise() and %>%: reshape to long form, then
# take the mean of the stacked values within each team.
team_gather <- team %>%
  gather(key = variable, value = values, wins, losses)
team_gather %>%
  group_by(team_name) %>%
  summarise(mean(values)) %>%
  as.data.frame()

# Applying functions to a data frame (speeding up computation).
weight <- ceiling(runif(500000) * 50) + 40
height <- ceiling(runif(500000) * 50) + 140
h_w <- data.frame(height, weight) # data frame of heights and weights
# Preallocate the result as a double vector so the loop fills it in place
# without a type change on the first assignment.
bmi <- rep(NA_real_, times = nrow(h_w))
# Row-by-row loop, wrapped in system.time() to report how long it takes
# (the original note reports roughly 13 seconds). The loop is run once only;
# an untimed duplicate of the same loop would just double the waiting time.
# seq_len() is used so a zero-row data frame yields zero iterations.
system.time(for (i in seq_len(nrow(h_w))) {
  bmi[i] <- h_w[i, "weight"] / (h_w[i, "height"] / 100)^2
})
# `digits` sets the number of significant digits used when printing numbers
# (not the number of decimal places); 7 is R's default.
options(digits = 7)
# Vectorised version of the same calculation (original note: about 0.02 s).
system.time(
  bmi <- h_w$weight / (h_w$height / 100)^2
)

#' Count the distinct values in a vector.
#'
#' @param x A vector.
#' @return The number of unique elements of `x`, i.e. `length(unique(x))`.
distinct_counts <- function(x) {
  length(unique(x))
}
# apply(): MARGIN = 2 runs distinct_counts once per column (variable);
# MARGIN = 1 would run it once per row (observation).
# Note: apply() converts the data frame to a matrix before iterating.
apply(X = iris, MARGIN = 2, FUN = distinct_counts)
lapply(X = iris, FUN = distinct_counts) # result is returned as a list
sapply(X = iris, FUN = distinct_counts) # result is simplified to a vector
# tapply(): number of distinct Sepal.Length values within each Species.
tapply(X = iris$Sepal.Length, INDEX = iris$Species, FUN = distinct_counts)

# Exercise: compute BMI for the height/weight example using mapply().
# mapply() fits when the function needs several inputs per element
# (here height and weight).
bmi <- mapply(
  function(height, weight) weight / (height / 100)^2,
  h_w$height,
  h_w$weight
)

提供 R script 參考（格式為 CP950）

參考書籍: 輕鬆學習R語言：從基礎到應用，掌握資料科學的關鍵能力