LearningR-数据处理

1. R自带函数

2. reshape2
data restructuring

3. dplyr
data aggregation

4. tidyr
待整理

5. 字符串处理

1. R自带函数

1.1 转置

``````cars <- mtcars[1:5,1:4]
cars
t(cars)``````

``````x <- array(1:24, 2:4)
xt <- aperm(x, c(2,1,3))
dim(x)
dim(xt)``````

1.2 整合数据aggregate

``aggregate(x,by,FUN)``

``````options(digits=2)
attach(mtcars)
mydata <- aggregate(mtcars, by=list(cyl,gear), FUN=mean, na.rm=TRUE)
mydata``````

by中的变量必须在一个列表中（即使只有一个变量）。也可以在列表中为各组声明自定义的名称，例如by=list(Group.cyl=cyl,Group.gears=gear)。

``````## example with character variables and NAs
testDF <- data.frame(v1 = c(1,3,5,7,8,3,5,NA,4,5,7,9),
v2 = c(11,33,55,77,88,33,55,NA,44,55,77,99) )
by1 <- c("red", "blue", 1, 2, NA, "big", 1, 2, "red", 1, NA, 12)
by2 <- c("wet", "dry", 99, 95, NA, "damp", 95, 99, "red", 99, NA, NA)
aggregate(x = testDF, by = list(by1, by2), FUN = "mean")

# and if you want to treat NAs as a group
fby1 <- factor(by1, exclude = "")
fby2 <- factor(by2, exclude = "")
aggregate(x = testDF, by = list(fby1, fby2), FUN = "mean")

## Formulas, one ~ one, one ~ many, many ~ one, and many ~ many:
aggregate(weight ~ feed, data = chickwts, mean)
aggregate(breaks ~ wool + tension, data = warpbreaks, mean)
aggregate(cbind(Ozone, Temp) ~ Month, data = airquality, mean)
aggregate(cbind(ncases, ncontrols) ~ alcgp + tobgp, data = esoph, sum)

## Dot notation:
aggregate(. ~ Species, data = iris, mean)
aggregate(len ~ ., data = ToothGrowth, mean)

## Often followed by xtabs():
ag <- aggregate(len ~ ., data = ToothGrowth, mean)
xtabs(len ~ ., data = ag)

## Compute the average annual approval ratings for American presidents.
aggregate(presidents, nfrequency = 1, FUN = mean)
## Give the summer less weight.
aggregate(presidents, nfrequency = 1,
FUN = weighted.mean, w = c(1, 1, 0.5, 1))``````

1.4 union和intersect

``````x <- c(sort(sample(1:20, 9)), NA)
y <- c(sort(sample(3:23, 7)), NA)
union(x, y)
intersect(x, y)
setdiff(x, y)
setdiff(y, x)
setequal(x, y)
#%in%
(1:10) %in% c(3,7,12)
"%w/o%" <- function(x, y) x[!x %in% y]
(1:10) %w/o% c(3,7,12)
sstr <- c("c","ab","B","bba","c",NA,"@","bla","a","Ba","%")
sstr %in% c(letters, LETTERS)``````

1.5 合并 cbind和rbind

• rbind() ：纵向合并两个数据框（数据集）

• cbind() ：横向合并两个数据框（数据集）

使用rbind()纵向合并时，两个数据框必须拥有相同的变量。如果dataframeA中含有dataframeB中没有的变量，合并之前需要做以下两种处理之一：

(1)删除dataframeA中的多余变量；

(2)在dataframeB中创建追加的变量并将其值设为NA(缺失)。

``````x1 <- c(1:5)
x2 <- c(21:25)
x3 <- c(31:35)
r1 <- cbind(x1, x2)
r2 <- rbind(x1, x2)
r31 <- cbind(r1, x3)
r32 <- rbind(r2, x3)``````

1.6 匹配合并 merge

merge效果同dplyr的join，join的效率更高。

• inner_join 等价于 merge(all=F)

• left_join 等价于 merge(all.x=T, all.y=F)

• right_join 等价于 merge(all.x=F, all.y=T)

• full_join 等价于 merge(all=T)

``````#authors和books
authors <- data.frame(
surname = I(c("Tukey", "Venables", "Tierney", "Ripley", "McNeil")),
nationality = c("US", "Australia", "US", "UK", "Australia"),
deceased = c("yes", rep("no", 4)))
books <- data.frame(
name = I(c("Tukey", "Venables", "Tierney",
"Ripley", "Ripley", "McNeil", "R Core")),
title = c("Exploratory Data Analysis",
"Modern Applied Statistics ...",
"LISP-STAT",
"Spatial Statistics", "Stochastic Simulation",
"Interactive Data Analysis",
"An Introduction to R"),
other.author = c(NA, "Ripley", NA, NA, NA, NA,
"Venables & Smith"))

m1 <- merge(authors, books, by.x = "surname", by.y = "name")
m2 <- merge(books, authors, by.x = "name", by.y = "surname")
#m1和m2结果相同，只是结果的列名不同。
#left_join: keep every row of authors (all.x), drop books without a matching author
m3 <- merge(authors, books, by.x = "surname", by.y = "name", all.x = TRUE, all.y = FALSE)
#right_join: keep every row of books (all.y), drop authors without a matching book
m4 <- merge(authors, books, by.x = "surname", by.y = "name", all.x = FALSE, all.y = TRUE)
#full_join: keep unmatched rows from both sides
m5 <- merge(authors, books, by.x = "surname", by.y = "name", all = TRUE)

# dplyr counterparts of the merge() calls above; the join verbs are provided
# by dplyr, so the package has to be loaded before they can be called.
library(dplyr)
m11 <- inner_join(authors, books, by=c("surname"="name"))
m22 <- inner_join(books, authors, by=c("name"="surname"))
m33 <- left_join(authors, books, by=c("surname"="name"))
m44 <- right_join(authors, books, by=c("surname"="name"))
m55 <- full_join(authors, books, by=c("surname"="name"))``````

1.7 排除重复数据 unique

unique 函数可以去掉向量、数据框或类似数列的数据中重复的元素。

``````x <- c(9:20, 1:5, 3:7, 0:8)
y <- unique(x)
#下列方式也可以，但unique方式效率更高.
#duplicated 函数返回了元素是否重复的逻辑值.
y1 <- x[!duplicated(x)]``````

2. reshape2包

``````#数据集mydata
ID <- c(1,1,2,2)
Time <- c(1,2,1,2)
X1 <- c(5,3,6,2)
X2 <- c(6,5,1,4)
mydata <- data.frame(ID,Time,X1,X2)``````

2.1融合-melt

``````library(reshape2)
md <- melt(mydata, id=c("ID","Time"))
md <- melt(mydata, id=1:2)``````

2.2重铸-dcast和acast

Use acast or dcast depending on whether you want vector/matrix/array output or data frame output. Data frames can have at most two dimensions.

1. dcast——返回的结果是一个数据框

2. acast——返回的结果可以是向量、矩阵或者数组

``````newdata <- dcast(data, formula, fun.aggregate = NULL, ...,
margins = NULL, subset = NULL, fill = NULL, drop = TRUE,
value.var = guess_value(data))
newdata <- acast(data, formula, fun.aggregate = NULL, ...,
margins = NULL, subset = NULL, fill = NULL, drop = TRUE,
value.var = guess_value(data))``````

``rowvar1 + rowvar2 + ... ~ colvar1 + colvar2 + ...``

``````#执行整合
# Cast with aggregation: cells that collect several values are reduced with mean().
acast(md, ID~variable, mean)    # matrix result, one row per ID
dcast(md, ID~variable, mean)    # data frame result, one row per ID
dcast(md, Time~variable, mean)  # one row per Time; Time is the id column created by melt()
dcast(md, ID~Time, mean)        # one row per ID, one column per Time value
#不执行整合
dcast(md, ID+Time~variable)
dcast(md, ID+variable~Time)
dcast(md, ID~variable+Time)``````

2.3 练习

``````library(reshape2)
mydata <- airquality
mydata1 <- melt(mydata, id=c("Month", "Day"),
variable.name = "type",value.name = "val")
#选定测量变量为Ozone、Wind
mydata2 <- melt(mydata, id=c("Month", "Day"),
measure = c("Ozone","Wind"),
variable.name = "type",value.name = "val")
str(mydata1)
str(mydata2)
#大写转换为小写
names(mydata) <- tolower(names(mydata))
a <- melt(mydata, id=c("month", "day"), na.rm=TRUE)
#数据b和原始数据airquality一样，数据复原了。
b <- dcast(a , month + day ~variable)
result1 <- dcast(a , month  ~variable ,mean)
#查看缺失值数量的函数
myfun <- function(x){return(sum(is.na(x)))}
result2 <- dcast(a, month  ~variable ,myfun)
result3 <- melt(mydata, id=c("month", "day"))
result4 <- dcast(result3 , month  ~variable ,myfun)
result5 <- recast(mydata , month ~ variable ,
id.var = c('month','day') , fun = myfun)``````

3. dplyr

3.1 基本操作

3.1.1 数据类型

``````library(dplyr)
iris_df <- tbl_df(iris)``````

3.1.2 筛选filter

``````filter(iris_df, Species == 'setosa' , Sepal.Length >=5)
filter(iris_df, Species == 'setosa' & Sepal.Length >=5)``````

``iris_df[iris_df$Species == 'setosa' & iris_df$Sepal.Length >=5, ]``

``filter(iris_df, Species == 'setosa' | Sepal.Length >=5)``

3.1.3 排列 arrange

``````arrange(iris_df, Sepal.Length, Sepal.Width)
arrange(iris_df, desc(Sepal.Length))
#这个函数和 plyr::arrange() 是一样的, 类似于 order()``````

``````iris_df[order(iris_df$Sepal.Length, iris_df$Sepal.Width), ]
iris_df[order(desc(iris_df$Sepal.Length)), ]``````

3.1.4 选择select

``````select(iris_df, 5, 1:2)
select(iris_df, Species, Sepal.Length, Sepal.Width)
select(iris, Species, everything())
#重命名列名
select(iris_df, Species, Length=Sepal.Length, Width=Sepal.Width)
select(iris_df, petal = starts_with("Petal"))``````

``select(iris_df, -Petal.Length, -Petal.Width)``

select的特殊函数

• starts_with(x, ignore.case = TRUE): names starts with x

• ends_with(x, ignore.case = TRUE): names ends in x

• contains(x, ignore.case = TRUE): selects all variables whose name contains

• matches(x, ignore.case = TRUE): selects all variables whose name matches the regular expression x

• num_range("x", 1:5, width = 2): selects all variables (numerically) from x01 to x05.

• one_of("x", "y", "z"): selects variables provided in a character vector.

• everything(): selects all variables.

``````select(iris_df, everything())
select(iris_df, starts_with("Petal"))
select(iris_df, ends_with("Width"))
select(iris_df, contains("etal"))
select(iris_df, matches(".t."))
#选取名称符合指定表达式规则的列
select(iris_df, Sepal.Length:Petal.Width)
select(iris_df, Petal.Length, Petal.Width)
vars <- c("Petal.Length", "Petal.Width")
select(iris_df, one_of(vars))
df <- as.data.frame(matrix(runif(100), nrow = 10))
df <- tbl_df(df)
select(df, V4:V6)
select(df, num_range("V", 4:6))``````

":" 选择连续列，contains来匹配列名

``````subset(iris,select=c(1,2))
subset(iris,select=c(3,4))
subset(iris,select=c(Petal.Length, Petal.Width))``````

Programming with select 存疑??

``````select_(iris_df, ~Petal.Length)
select_(iris_df, "Petal.Length")
select_(iris_df, lazyeval::interp(~matches(x), x = ".t."))
select_(iris_df, quote(-Petal.Length), quote(-Petal.Width))
select_(iris_df, .dots = list(quote(-Petal.Length), quote(-Petal.Width)))``````

3.1.5 添加新变量mutate

``````mtcars_df <- tbl_df(mtcars)
mutate(mtcars_df, displ_l = disp / 61.0237)
#transmute结果只有计算的字段
transmute(mtcars_df, displ_l = disp / 61.0237)``````

mutate_each()

``mutate_each(iris, funs(min_rank))``

plyr::mutate() 与 base::transform() 相似, 优势在于可以在同一语句中对刚增加的列进行操作。

``````mutate(hflights_df,
gain = ArrDelay - DepDelay,
gain_per_hour = gain / (AirTime / 60)
)
#而同样操作用R自带函数 transform() 的话就会报错:
transform(hflights,
gain = ArrDelay - DepDelay,
gain_per_hour = gain / (AirTime / 60)
)``````

``mtcars_df <- data.frame(mtcars_df,displ_l = mtcars_df$disp / 61.0237)``

3.1.6 汇总summarise

``````summarise(mtcars_df, mean(disp, na.rm = TRUE), n())
summarise(group_by(mtcars_df, cyl), mean(disp), n())
summarise(group_by(mtcars_df, cyl), m = mean(disp), sd = sd(disp))
#对每一列运行概述函数。
summarise_each(iris, funs(mean))
by_species <- iris %>% group_by(Species)
by_species %>% summarise_each(funs(length))
by_species %>% summarise_each(funs(mean))
by_species %>% summarise_each(funs(mean), Petal.Width)
by_species %>% summarise_each(funs(mean), matches("Width"))``````

count()

``````#计算各变量中每一个特定值的行数(带权重或不带权重)。
count(iris, Species, wt = Sepal.Length)
count(iris, Species, mycount = n())``````

3.1.7 tally

``````mtcars %>%
group_by(cyl, vs) %>%
tally(sort = TRUE)
#与下列方式相同
mtcars %>%
group_by(cyl, vs) %>%
summarise(n = n()) %>%
arrange(desc(n))``````

3.2 分组group_by

``````summarise(mtcars_df, mean(disp, na.rm = TRUE), n())
summarise(group_by(mtcars_df, cyl), mean(disp), n(),n_distinct(gear))
summarise(group_by(mtcars_df, cyl), m = mean(disp), sd = sd(disp))
#a mutate/rename followed by a simple group_by
group_by(mtcars_df, vsam = vs + am)
group_by(mtcars_df, vs2 = vs)
summarise(group_by(mtcars_df, cyl2=cyl), m = mean(disp), sd = sd(disp))``````

n(): 计算个数
n_distinct(x): 计算 x 中唯一值的个数

3.3 链式操作(管道) %>% 或 %.%

dplyr包还新引进了一个操作符，读成then，使用时把数据名作为开头, 然后依次对此数据进行多步操作。比如:

``````mtcars %>%
group_by(cyl) %>%
summarise(total = sum(disp)) %>%
arrange(desc(total)) %>%
head(5)
x1 <- 1:5
x2 <- 2:6
(x1-x2)^2%>%sum()%>%sqrt()``````

``````head(arrange(summarise(group_by(mtcars, cyl), total = sum(disp)) , desc(total)), 5)
x1 <- 1:5
x2 <- 2:6
sqrt(sum((x1-x2)^2))``````

``````totals <- aggregate(. ~ cyl, data=mtcars[,c("cyl","disp")], sum)
ranks <- sort.list(-totals$disp)
#ranks <- order(-totals$disp)
totals[ranks[1:5],]``````

3.5 数据匹配合并join

• inner_join(x, y) ：只包含同时出现在x,y表中的行

• left_join(x, y) ：包含所有x中以及y中匹配的行

• semi_join(x, y) ：包含x中，在y中有匹配的行，结果为x的子集

• anti_join(x, y) ：包含x中，不匹配y的行，结果为x的子集，与semi_join相反

• full_join(x, y) ：包含所有x、y中的行

• right_join(x, y) ：包含所有y中以及x中匹配的行

``````x <- data.frame(name = c("John", "Paul", "George", "Ringo", "Stuart", "Pete"),
instrument = c("guitar", "bass", "guitar", "drums", "bass","drums"))
y <- data.frame(name = c("John", "Paul", "George", "Ringo", "Brian"),
band = c("TRUE", "TRUE", "TRUE", "TRUE", "FALSE"))
inner_join(x, y)
left_join(x, y)
semi_join(x, y)
anti_join(x, y)
full_join(x, y)
right_join(x,y)``````

3.6 连接数据库

• dplyr 可以连接数据库

• 使用与本地数据框操作一样的语法

• 只支持生成SELECT语句

• 支持SQLite, PostgreSQL/Redshift, MySQL/MariaDB, BigQuery, MonetDB

3.7 利用窗体函数变换数据

dplyr::lag 把除第一个值以外的所有元素延后，第一个元素为NA
dplyr::dense_rank 无缝排序
dplyr::min_rank 排序。并列时，其他序号顺延
dplyr::percent_rank 排序，并把序号缩放到[0,1]区间
dplyr::row_number 排序。并列时，位置在前的并列数据序号在前
dplyr::ntile 把向量分为n份
dplyr::between 数据是否在a和b之间
dplyr::cume_dist 累计分布
dplyr::cumall 累计all函数
dplyr::cumany 累计any函数
dplyr::cummean 累计mean函数
cumsum 累计sum函数
cummax 累计max函数
cummin 累计min函数
cumprod 累计prod函数
pmax 针对元素的max函数
pmin 针对元素的min函数

（先挖坑...）

5. 字符串处理

5.1 字符个数 nchar

nchar()能够获取字符串的长度，它和length()的结果是有区别的。

``````nchar(c("abc", "abcd"))    #求字符串中的字符个数，返回向量c(3, 4)
length(c("abc", "abcd"))  #返回2，向量中元素的个数``````

5.2 连接字符 paste

paste()不仅可以连接多个字符串，还可以将对象自动转换为字符串再相连，另外它还能处理向量，所以功能更强大。

``````paste("fitbit", month, ".jpg", sep="")
paste("fitbit", 1:12, ".jpg", sep = "")``````

paste默认的分隔符是空格，必须指定sep=""。还有一个collapse参数，可以把这些字符串拼成一个长字符串，而不是放在一个向量中。

``paste("fitbit", 1:3, ".jpg", sep = "", collapse = "; ")``

5.3 分割字符 strsplit

``strsplit(x, split, fixed = FALSE, perl = FALSE, useBytes = FALSE)``
``````x <- c(as = "asfef", qu = "qwerty", "yuiop[", "b", "stuff.blah.yech")
strsplit(x,"e")
#需要注意的细节
strsplit(paste(c("", "a", ""), collapse="#"), split="#")[[1]]
strsplit("", " ")[[1]]
strsplit(" ", " ")[[1]]

##倒序运用：
# Reverse the characters of every string in the character vector `x`.
# Returns a character vector with one reversed string per element of `x`.
# vapply() pins the result type, so empty input yields character(0)
# instead of the empty list that sapply() would return.
strReverse <- function(x) {
  # split = NULL breaks each string into its individual characters
  chars <- strsplit(x, NULL)
  vapply(lapply(chars, rev), paste, character(1), collapse = "")
}
strReverse(c("abc", "Statistics"))``````

5.4 提取字符 substr与substring

``````substr(x, start, stop)
substring(text, first, last = 1000000L)
substr(x, start, stop) <- value
substring(text, first, last = 1000000L) <- value``````
``````substr("abcdef", 2, 4)
substring("abcdef", 1:6, 1:6)

substr(rep("abcdef", 4), 1:4, 4:5)
x <- c("asfef", "qwerty", "yuiop[", "b", "stuff.blah.yech")
substr(x, 2, 5)
substring(x, 2, 4:6)
substring(x, 2) <- c("..", "+++")``````

5.5 替换字符 sub和gsub

• sub 只做一次替换（不管有几次匹配）

• gsub 把满足条件的匹配都做替换

``````sub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)
gsub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)``````

``````text <- "Hello Adam!\nHello Ava!"
text``````

sub和gsub函数可以使用提取表达式（转义字符+数字）让部分变成全部

``````sub(pattern=".*(Adam).*", replacement="\\1", text)
str <- "Now is the time      "
sub(" +$", "", str)
sub("[[:space:]]+$", "", str)
sub("\\s+$", "", str, perl = TRUE)
txt <- "a test of capitalizing"
gsub("(\\w)(\\w*)", "\\U\\1\\L\\2", txt, perl=TRUE)
gsub("\\b(\\w)",    "\\U\\1",       txt, perl=TRUE)``````

5.6 字符查询匹配 grep

• grep 返回匹配项的下标

• grepl 返回所有查询结果的逻辑向量

• regexpr

• gregexpr

• regexec
regexpr、gregexpr和regexec这三个函数返回的结果包含了匹配的具体位置和字符串长度信息，可以用于字符串的提取操作。

``````x <- c("abc","abcdef","def")
grep("def", x)
#grep返回匹配项的下标
#grepl返回所有查询结果的逻辑向量。两者的结果都可用于提取数据子集
grepl("def", x)
#regexpr、gregexpr和regexec``````

5.5 其他

• 大小写转换 tolower与toupper

• 列表转换为向量unlist
`unlist(x, recursive = TRUE, use.names = TRUE)`

• 重复输入rep()

``````rep(1:4, 2)
rep(1:4, each = 2)
rep(1:4, c(2,2,2,2))
rep(1:4, c(2,1,2,1))
rep(1:4, each = 2, len = 4)
rep(1:4, each = 2, len = 10)
rep(1:4, each = 2, times = 3) ``````

5.6 stringr包

stringr包是用来处理字符串的。(先挖坑...)

R语言学习笔记

16 人关注
14 篇文章