funcion melt en r
RemodelaciĆ³n complicada (8)
Quiero remodelar mi marco de datos de formato largo a ancho y pierdo algunos datos que me gustaría conservar. Para el siguiente ejemplo:
df <- data.frame(Par1 = unlist(strsplit("AABBCCC","")),
Par2 = unlist(strsplit("DDEEFFF","")),
ParD = unlist(strsplit("foo,bar,baz,qux,bla,xyz,meh",",")),
Type = unlist(strsplit("pre,post,pre,post,pre,post,post",",")),
Val = c(10,20,30,40,50,60,70))
# Par1 Par2 ParD Type Val
# 1 A D foo pre 10
# 2 A D bar post 20
# 3 B E baz pre 30
# 4 B E qux post 40
# 5 C F bla pre 50
# 6 C F xyz post 60
# 7 C F meh post 70
dfw <- dcast(df,
formula = Par1 + Par2 ~ Type,
value.var = "Val",
fun.aggregate = mean)
# Par1 Par2 post pre
# 1 A D 20 10
# 2 B E 40 30
# 3 C F 65 50
Esto es casi lo que necesito, pero me gustaría tenerlo.
- algunos datos de
ParD
campo del campoParD
(por ejemplo, como una sola cadena combinada), - Número de observaciones utilizadas para la agregación.
es decir, me gustaría que el data.frame resultante fuera como sigue:
# Par1 Par2 post pre Num.pre Num.post ParD
# 1 A D 20 10 1 1 foo_bar
# 2 B E 40 30 1 1 baz_qux
# 3 C F 65 50 1 2 bla_xyz_meh
Estaría agradecido por cualquier idea. Por ejemplo, traté de resolver la segunda tarea escribiendo en dcast: fun.aggregate=function(x) c(Val=mean(x),Num=length(x))
, pero esto causa un error.
¡Qué gran oportunidad para comparar! A continuación se muestran algunas ejecuciones del método plyr
(como lo sugiere @agstudy) en comparación con el método data.table
(como lo sugiere @Arun) utilizando diferentes tamaños de muestra (N = 900, 2700, 10800)
Resumen:
El método data.table
supera al método plyr
en un factor de 7.5
#-------------------#
# M E T H O D S #
#-------------------#
# additional methods below, in the updates
# Method 1 -- suggested by @agstudy
plyrMethod <- quote({
dfw<-dcast(df,
formula = Par1+Par2~Type,
value.var="Val",
fun.aggregate=mean)
dat <- ddply(df,.(Par1,Par2),function(x){
data.frame(ParD=paste(paste(x$ParD),collapse=''_''),
Num.pre =length(x$Type[x$Type ==''pre'']),
Num.post = length(x$Type[x$Type ==''post'']))
})
merge(dfw,dat)
})
# Method 2 -- suggested by @Arun
dtMethod <- quote(
dt[, list(pre=mean(Val[Type == "pre"]),
post=mean(Val[Type == "post"]),
Num.pre=length(Val[Type == "pre"]),
Num.post=length(Val[Type == "post"]),
ParD = paste(ParD, collapse="_")),
by=list(Par1, Par2)]
)
# Method 3 -- suggested by @regetz
reduceMethod <- quote(
Reduce(merge, list(
dcast(df, formula = Par1+Par2~Type, value.var="Val",
fun.aggregate=mean),
setNames(dcast(df, formula = Par1+Par2~Type, value.var="Val",
fun.aggregate=length), c("Par1", "Par2", "Num.post",
"Num.pre")),
aggregate(df["ParD"], df[c("Par1", "Par2")], paste, collapse="_")
))
)
# Method 4 -- suggested by @Ramnath
castddplyMethod <- quote(
reshape::cast(Par1 + Par2 + ParD ~ Type,
data = ddply(df, .(Par1, Par2), transform,
ParD = paste(ParD, collapse = "_")),
fun = c(mean, length)
)
)
# SAMPLE DATA #
#-------------#
library(data.table)
library(plyr)
library(reshape2)
library(rbenchmark)
# for Par1, ParD
LLL <- apply(expand.grid(LETTERS, LETTERS, LETTERS, stringsAsFactors=FALSE), 1, paste0, collapse="")
lll <- apply(expand.grid(letters, letters, letters, stringsAsFactors=FALSE), 1, paste0, collapse="")
# max size is 17568 with current sample data setup, ie: floor(length(LLL) / 18) * 18
size <- 17568
size <- 10800
size <- 900
set.seed(1)
df<-data.frame(Par1=rep(LLL[1:(size/2)], times=rep(c(2,2,3), size)[1:(size/2)])[1:(size)]
, Par2=rep(lll[1:(size/2)], times=rep(c(2,2,3), size)[1:(size/2)])[1:(size)]
, ParD=sample(unlist(lapply(c("f", "b"), paste0, lll)), size, FALSE)
, Type=rep(c("pre","post"), size/2)
, Val =sample(seq(10,100,10), size, TRUE)
)
dt <- data.table(df, key=c("Par1", "Par2"))
# Confirming Same Results #
#-------------------------#
# Evaluate
DF1 <- eval(plyrMethod)
DF2 <- eval(dtMethod)
# Convert to DF and sort columns and sort ParD levels, for use in identical
colOrder <- sort(names(DF1))
DF1 <- DF1[, colOrder]
DF2 <- as.data.frame(DF2)[, colOrder]
DF2$ParD <- factor(DF2$ParD, levels=levels(DF1$ParD))
identical((DF1), (DF2))
# [1] TRUE
#-------------------------#
RESULTADOS
#--------------------#
# BENCHMARK #
#--------------------#
benchmark(plyr=eval(plyrMethod), dt=eval(dtMethod), reduce=eval(reduceMethod), castddply=eval(castddplyMethod),
replications=5, columns=c("relative", "test", "elapsed", "user.self", "sys.self", "replications"),
order="relative")
# SAMPLE SIZE = 900
relative test elapsed user.self sys.self replications
1.000 reduce 0.392 0.375 0.018 5
1.003 dt 0.393 0.377 0.016 5
7.064 plyr 2.769 2.721 0.047 5
8.003 castddply 3.137 3.030 0.106 5
# SAMPLE SIZE = 2,700
relative test elapsed user.self sys.self replications
1.000 dt 1.371 1.327 0.090 5
2.205 reduce 3.023 2.927 0.102 5
7.291 plyr 9.996 9.644 0.377 5
# SAMPLE SIZE = 10,800
relative test elapsed user.self sys.self replications
1.000 dt 8.678 7.168 1.507 5
2.769 reduce 24.029 23.231 0.786 5
6.946 plyr 60.277 52.298 7.947 5
13.796 castddply 119.719 113.333 10.816 5
# SAMPLE SIZE = 17,568
relative test elapsed user.self sys.self replications
1.000 dt 27.421 13.042 14.470 5
4.030 reduce 110.498 75.853 34.922 5
5.414 plyr 148.452 105.776 43.156 5
Actualización: Resultados agregados para baseMethod1
# Used only sample size of 90, as it was taking long
relative test elapsed user.self sys.self replications
1.000 dt 0.044 0.043 0.001 5
7.773 plyr 0.342 0.339 0.003 5
65.614 base1 2.887 2.866 0.028 5
Where
baseMethod1 <- quote({
step1 <- with(df, split(df, list(Par1, Par2)))
step2 <- step1[sapply(step1, nrow) > 0]
step3 <- lapply(step2, function(x) {
piece1 <- tapply(x$Val, x$Type, mean)
piece2 <- tapply(x$Type, x$Type, length)
names(piece2) <- paste0("Num.", names(piece2))
out <- x[1, 1:2]
out[, 3:6] <- c(piece1, piece2)
names(out)[3:6] <- names(c(piece1, piece2))
out$ParD <- paste(unique(x$ParD), collapse="_")
out
})
data.frame(do.call(rbind, step3), row.names=NULL)
})
Actualización 2: Se agregó la clave al DT como parte de la métrica
Agregar el paso de indexación al punto de referencia para la imparcialidad según el comentario de @MatthewDowle.
Sin embargo, es de suponer que, si se usa data.table, estará en lugar de data.frame y, por lo tanto, la indexación se realizará una vez y no simplemente para este procedimiento.
dtMethod.withkey <- quote({
dt <- data.table(df, key=c("Par1", "Par2"))
dt[, list(pre=mean(Val[Type == "pre"]),
post=mean(Val[Type == "post"]),
Num.pre=length(Val[Type == "pre"]),
Num.post=length(Val[Type == "post"]),
ParD = paste(ParD, collapse="_")),
by=list(Par1, Par2)]
})
# SAMPLE SIZE = 10,800
relative test elapsed user.self sys.self replications
1.000 dt 9.155 7.055 2.137 5
1.043 dt.withkey 9.553 7.245 2.353 5
3.567 reduce 32.659 31.196 1.586 5
6.703 plyr 61.364 54.080 7.600 5
Actualización 3: Benchmarking @ MD''s edits a la respuesta original de @ Arun
dtMethod.MD1 <- quote(
dt[, list(pre=mean(Val[.pre <- Type=="pre"]), # save .pre
post=mean(Val[.post <- Type=="post"]), # save .post
pre.num=sum(.pre), # reuse .pre
post.num=sum(.post), # reuse .post
ParD = paste(ParD, collapse="_")),
by=list(Par1, Par2)]
)
dtMethod.MD2 <- quote(
dt[, { .pre <- Type=="pre" # or save .pre and .post up front
.post <- Type=="post"
list(pre=mean(Val[.pre]),
post=mean(Val[.post]),
pre.num=sum(.pre),
post.num=sum(.post),
ParD = paste(ParD, collapse="_")) }
, by=list(Par1, Par2)]
)
dtMethod.MD3 <- quote(
dt[, { .pre <- Type=="pre"
.post <- Type=="post"
list(pre=mean(Val[.pre]),
post=mean(Val[.post]),
pre.num=sum(.pre),
post.num=sum(.post),
ParD = list(ParD)) } # list() faster than paste()
, by=list(Par1, Par2)]
)
benchmark(dt.M1=eval(dtMethod.MD1), dt.M2=eval(dtMethod.MD2), dt.M3=eval(dtMethod.MD3), dt=eval(dtMethod),
replications=5, columns=c("relative", "test", "elapsed", "user.self", "sys.self", "replications"),
order="relative")
#--------------------#
Comparing the different data.table methods amongst themselves
# SAMPLE SIZE = 900
relative test elapsed user.self sys.self replications
1.000 dt.M3 0.198 0.197 0.001 5 <~~~ "list()" Method
1.242 dt.M1 0.246 0.243 0.004 5
1.253 dt.M2 0.248 0.242 0.007 5
1.884 dt 0.373 0.367 0.007 5
# SAMPLE SIZE = 17,568
relative test elapsed user.self sys.self replications
1.000 dt.M3 33.492 24.487 9.122 5 <~~~ "list()" Method
1.086 dt.M1 36.388 11.442 25.086 5
1.086 dt.M2 36.388 10.845 25.660 5
1.126 dt 37.701 13.256 24.535 5
Comparing MD3 ("list" method) with MD1 (best of DT non-list methods)
Using a clean session (ie, removing string cache)
_Note: Ran the following twice, fresh session each time, with practically identical results
Then re-ran in the *same* session, with reps=5. Results very different._
benchmark(dt.M1=eval(dtMethod.MD1), dt.M3=eval(dtMethod.MD3), replications=1, columns=c("relative", "test", "elapsed", "user.self", "sys.self", "replications"), order="relative")
# SAMPLE SIZE=17,568; CLEAN SESSION
relative test elapsed user.self sys.self replications
1.000 dt.M1 8.885 4.260 4.617 1
1.633 dt.M3 14.506 12.821 1.677 1
# SAMPLE SIZE=17,568; *SAME* SESSION
relative test elapsed user.self sys.self replications
1.000 dt.M1 33.443 10.200 23.226 5
1.048 dt.M3 35.060 26.127 8.915 5
#--------------------#
New benchmarks against previous methods
_Note: Not using the "list method" as results are not the same as other methods_
# SAMPLE SIZE = 900
relative test elapsed user.self sys.self replications
1.000 dt.M1 0.254 0.247 0.008 5
1.705 reduce 0.433 0.425 0.010 5
11.280 plyr 2.865 2.842 0.031 5
# SAMPLE SIZE = 17,568
relative test elapsed user.self sys.self replications
1.000 dt.M1 24.826 10.427 14.458 5
4.348 reduce 107.935 70.107 38.314 5
5.942 plyr 147.508 106.958 41.083 5
Creo que esta solución de base R es comparable con la solución de tabla de datos de @ Arun. (Lo que no quiere decir que lo prefiera, ¡ese código es mucho más simple!)
baseMethod2 <- quote({
is <- unname(split(1:nrow(df), with(df, paste(Par1, Par2, sep="/b"))))
i1 <- sapply(is, `[`, 1)
out <- with(df, data.frame(Par1=Par1[i1], Par2=Par2[i1]))
js <- lapply(is, function(i) split(i, df$Type[i]))
out$post <- sapply(js, function(j) mean(df$Val[j$post]))
out$pre <- sapply(js, function(j) mean(df$Val[j$pre]))
out$Num.pre <- sapply(js, function(j) length(j$pre))
out$Num.post <- sapply(js, function(j) length(j$post))
out$ParD <- sapply(is, function(x) paste(df$ParD[x], collapse="_"))
out
})
Usando el código de tiempo de @ RicardoSaporta con 900, 2700 y 10,800, respectivamente:
> relative test elapsed user.self sys.self replications
3 1.000 baseMethod2 0.230 0.229 0 5
1 1.130 dt 0.260 0.257 0 5
2 8.752 plyr 2.013 2.006 0 5
> relative test elapsed user.self sys.self replications
3 1.000 baseMethod2 0.877 0.872 0 5
1 1.068 dt 0.937 0.934 0 5
2 8.060 plyr 7.069 7.043 0 5
> relative test elapsed user.self sys.self replications
1 1.000 dt 6.232 6.178 0.031 5
3 1.085 baseMethod2 6.763 6.683 0.054 5
2 7.263 plyr 45.261 44.983 0.104 5
Intentar envolver diferentes expresiones de agregación en una función autocontenida (las expresiones deberían producir valores atómicos)
multi.by <- function(X, INDEX,...) {
expressions <- substitute(...())
duplicates <- duplicated(INDEX)
res <- do.call(rbind,sapply(split(X,cumsum(!duplicates),drop=T), function(part)
sapply(expressions,eval,part,simplify=F),simplify=F))
if (is.data.frame(INDEX)) res <- cbind(INDEX[!duplicates,],res)
else rownames(res) <- INDEX[!duplicates]
res
}
multi.by(df,df[,1:2],
pre=mean(Val[Type=="pre"]),
post=mean(Val[Type=="post"]),
Num.pre=sum(Type=="pre"),
Num.post=sum(Type=="post"),
ParD=paste(ParD, collapse="_"))
Podría hacer una combinación de dos dcasts y un agregado, aquí todo envuelto en una gran expresión principalmente para evitar tener objetos intermedios dando vueltas después:
Reduce(merge, list(
dcast(df, formula = Par1+Par2~Type, value.var="Val",
fun.aggregate=mean),
setNames(dcast(df, formula = Par1+Par2~Type, value.var="Val",
fun.aggregate=length), c("Par1", "Par2", "Num.post",
"Num.pre")),
aggregate(df["ParD"], df[c("Par1", "Par2")], paste, collapse="_")
))
Solución One Step que combina reshape::cast
con plyr::ddply
cast(Par1 + Par2 + ParD ~ Type, data = ddply(df, .(Par1, Par2), transform,
ParD = paste(ParD, collapse = "_")), fun = c(mean, length))
TENGA dcast
CUENTA que la función de reshape2
en reshape2
no permite que se pasen múltiples funciones agregadas, mientras que la función de reshape2
en reshape
sí lo hace.
Solución en 2 pasos usando ddply
(no estoy contento con, pero obtengo el resultado)
dat <- ddply(df,.(Par1,Par2),function(x){
data.frame(ParD=paste(paste(x$ParD),collapse=''_''),
Num.pre =length(x$Type[x$Type ==''pre'']),
Num.post = length(x$Type[x$Type ==''post'']))
})
merge(dfw,dat)
Par1 Par2 post pre ParD Num.pre Num.post
1 A D 2.0 1 foo_bar 1 1
2 B E 4.0 3 baz_qux 1 1
3 C F 6.5 5 bla_xyz_meh 1 2
Tarde a la fiesta, pero aquí hay otra alternativa usando data.table
:
require(data.table)
dt <- data.table(df, key=c("Par1", "Par2"))
dt[, list(pre=mean(Val[Type == "pre"]),
post=mean(Val[Type == "post"]),
pre.num=length(Val[Type == "pre"]),
post.num=length(Val[Type == "post"]),
ParD = paste(ParD, collapse="_")),
by=list(Par1, Par2)]
# Par1 Par2 pre post pre.num post.num ParD
# 1: A D 10 20 1 1 foo_bar
# 2: B E 30 40 1 1 baz_qux
# 3: C F 50 65 1 2 bla_xyz_meh
[de Matthew] +1 Algunas mejoras menores para guardar la repetición del mismo ==
, y para demostrar las variables locales dentro de j
.
dt[, list(pre=mean(Val[.pre <- Type=="pre"]), # save .pre
post=mean(Val[.post <- Type=="post"]), # save .post
pre.num=sum(.pre), # reuse .pre
post.num=sum(.post), # reuse .post
ParD = paste(ParD, collapse="_")),
by=list(Par1, Par2)]
# Par1 Par2 pre post pre.num post.num ParD
# 1: A D 10 20 1 1 foo_bar
# 2: B E 30 40 1 1 baz_qux
# 3: C F 50 65 1 2 bla_xyz_meh
dt[, { .pre <- Type=="pre" # or save .pre and .post up front
.post <- Type=="post"
list(pre=mean(Val[.pre]),
post=mean(Val[.post]),
pre.num=sum(.pre),
post.num=sum(.post),
ParD = paste(ParD, collapse="_")) }
, by=list(Par1, Par2)]
# Par1 Par2 pre post pre.num post.num ParD
# 1: A D 10 20 1 1 foo_bar
# 2: B E 30 40 1 1 baz_qux
# 3: C F 50 65 1 2 bla_xyz_meh
Y si una columna de list
está bien en lugar de paste
, entonces esto debería ser más rápido:
dt[, { .pre <- Type=="pre"
.post <- Type=="post"
list(pre=mean(Val[.pre]),
post=mean(Val[.post]),
pre.num=sum(.pre),
post.num=sum(.post),
ParD = list(ParD)) } # list() faster than paste()
, by=list(Par1, Par2)]
# Par1 Par2 pre post pre.num post.num ParD
# 1: A D 10 20 1 1 foo,bar
# 2: B E 30 40 1 1 baz,qux
# 3: C F 50 65 1 2 bla,xyz,meh
Voy a publicar, pero Agstudy''s me avergüenza:
step1 <- with(df, split(df, list(Par1, Par2)))
step2 <- step1[sapply(step1, nrow) > 0]
step3 <- lapply(step2, function(x) {
piece1 <- tapply(x$Val, x$Type, mean)
piece2 <- tapply(x$Type, x$Type, length)
names(piece2) <- paste0("Num.", names(piece2))
out <- x[1, 1:2]
out[, 3:6] <- c(piece1, piece2)
names(out)[3:6] <- names(c(piece1, piece2))
out$ParD <- paste(unique(x$ParD), collapse="_")
out
})
data.frame(do.call(rbind, step3), row.names=NULL)
Flexible:
Par1 Par2 post pre Num.post Num.pre ParD
1 A D 2.0 1 1 1 foo_bar
2 B E 4.0 3 1 1 baz_qux
3 C F 6.5 5 2 1 bla_xyz_meh