
Complicated reshaping

I want to reshape my data frame from long to wide format, but I lose some data that I would like to keep. For the following example:

df <- data.frame(Par1 = unlist(strsplit("AABBCCC","")),
                 Par2 = unlist(strsplit("DDEEFFF","")),
                 ParD = unlist(strsplit("foo,bar,baz,qux,bla,xyz,meh",",")),
                 Type = unlist(strsplit("pre,post,pre,post,pre,post,post",",")),
                 Val  = c(10,20,30,40,50,60,70))
#   Par1 Par2 ParD Type Val
# 1    A    D  foo  pre  10
# 2    A    D  bar post  20
# 3    B    E  baz  pre  30
# 4    B    E  qux post  40
# 5    C    F  bla  pre  50
# 6    C    F  xyz post  60
# 7    C    F  meh post  70

dfw <- dcast(df, formula = Par1 + Par2 ~ Type, value.var = "Val", fun.aggregate = mean)
#   Par1 Par2 post pre
# 1    A    D   20  10
# 2    B    E   40  30
# 3    C    F   65  50

This is almost what I need, but additionally I would like to have:

  1. some data from the ParD field (for example, as a single combined string),
  2. the number of observations used for the aggregation.

That is, I would like the resulting data.frame to look like this:

#   Par1 Par2 post pre Num.pre Num.post        ParD
# 1    A    D   20  10       1        1     foo_bar
# 2    B    E   40  30       1        1     baz_qux
# 3    C    F   65  50       1        2 bla_xyz_meh

I would be grateful for any ideas. For example, I tried to solve the second task by passing fun.aggregate=function(x) c(Val=mean(x), Num=length(x)) to dcast, but this causes an error.
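For reference, a minimal sketch of that failing attempt (assuming reshape2 is loaded and df is defined as above): dcast expects fun.aggregate to return a single value per cell, so a vector-valued aggregate like this is rejected.

library(reshape2)

# Vector-valued aggregate: each cell would need to hold two numbers
# (a mean and a count), which dcast does not allow, so this call errors.
dcast(df, Par1 + Par2 ~ Type, value.var = "Val",
      fun.aggregate = function(x) c(Val = mean(x), Num = length(x)))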


What a great opportunity to benchmark! Below are some runs of the plyr method (as suggested by @agstudy) compared against the data.table method (as suggested by @Arun), using different sample sizes (N = 900, 2700, 10800).

Summary:
The data.table method outperforms the plyr method by a factor of 7.5.

#-------------------#
#   M E T H O D S   #
#-------------------#

# additional methods below, in the updates

# Method 1 -- suggested by @agstudy
plyrMethod <- quote({
  dfw <- dcast(df, formula = Par1+Par2~Type, value.var="Val", fun.aggregate=mean)
  dat <- ddply(df, .(Par1, Par2), function(x){
    data.frame(ParD     = paste(paste(x$ParD), collapse="_"),
               Num.pre  = length(x$Type[x$Type == "pre"]),
               Num.post = length(x$Type[x$Type == "post"]))
  })
  merge(dfw, dat)
})

# Method 2 -- suggested by @Arun
dtMethod <- quote(
  dt[, list(pre      = mean(Val[Type == "pre"]),
            post     = mean(Val[Type == "post"]),
            Num.pre  = length(Val[Type == "pre"]),
            Num.post = length(Val[Type == "post"]),
            ParD     = paste(ParD, collapse="_")),
     by=list(Par1, Par2)]
)

# Method 3 -- suggested by @regetz
reduceMethod <- quote(
  Reduce(merge, list(
    dcast(df, formula = Par1+Par2~Type, value.var="Val", fun.aggregate=mean),
    setNames(dcast(df, formula = Par1+Par2~Type, value.var="Val",
                   fun.aggregate=length),
             c("Par1", "Par2", "Num.post", "Num.pre")),
    aggregate(df["ParD"], df[c("Par1", "Par2")], paste, collapse="_")
  ))
)

# Method 4 -- suggested by @Ramnath
castddplyMethod <- quote(
  reshape::cast(Par1 + Par2 + ParD ~ Type,
                data = ddply(df, .(Par1, Par2), transform,
                             ParD = paste(ParD, collapse = "_")),
                fun = c(mean, length))
)

#-------------#
# SAMPLE DATA #
#-------------#

library(data.table)
library(plyr)
library(reshape2)
library(rbenchmark)

# for Par1, ParD
LLL <- apply(expand.grid(LETTERS, LETTERS, LETTERS, stringsAsFactors=FALSE), 1, paste0, collapse="")
lll <- apply(expand.grid(letters, letters, letters, stringsAsFactors=FALSE), 1, paste0, collapse="")

# max size is 17568 with current sample data setup, ie: floor(length(LLL) / 18) * 18
size <- 17568
size <- 10800
size <- 900

set.seed(1)
df <- data.frame(Par1 = rep(LLL[1:(size/2)], times=rep(c(2,2,3), size)[1:(size/2)])[1:(size)],
                 Par2 = rep(lll[1:(size/2)], times=rep(c(2,2,3), size)[1:(size/2)])[1:(size)],
                 ParD = sample(unlist(lapply(c("f", "b"), paste0, lll)), size, FALSE),
                 Type = rep(c("pre", "post"), size/2),
                 Val  = sample(seq(10, 100, 10), size, TRUE))

dt <- data.table(df, key=c("Par1", "Par2"))

#-------------------------#
# Confirming Same Results #
#-------------------------#

# Evaluate
DF1 <- eval(plyrMethod)
DF2 <- eval(dtMethod)

# Convert to DF and sort columns and sort ParD levels, for use in identical
colOrder <- sort(names(DF1))
DF1 <- DF1[, colOrder]
DF2 <- as.data.frame(DF2)[, colOrder]
DF2$ParD <- factor(DF2$ParD, levels=levels(DF1$ParD))
identical((DF1), (DF2))
# [1] TRUE
#-------------------------#

RESULTS

#--------------------#
#     BENCHMARK      #
#--------------------#
benchmark(plyr=eval(plyrMethod), dt=eval(dtMethod),
          reduce=eval(reduceMethod), castddply=eval(castddplyMethod),
          replications=5,
          columns=c("relative", "test", "elapsed", "user.self", "sys.self", "replications"),
          order="relative")

# SAMPLE SIZE = 900
 relative      test elapsed user.self sys.self replications
    1.000    reduce   0.392     0.375    0.018            5
    1.003        dt   0.393     0.377    0.016            5
    7.064      plyr   2.769     2.721    0.047            5
    8.003 castddply   3.137     3.030    0.106            5

# SAMPLE SIZE = 2,700
 relative   test elapsed user.self sys.self replications
    1.000     dt   1.371     1.327    0.090            5
    2.205 reduce   3.023     2.927    0.102            5
    7.291   plyr   9.996     9.644    0.377            5

# SAMPLE SIZE = 10,800
 relative      test elapsed user.self sys.self replications
    1.000        dt   8.678     7.168    1.507            5
    2.769    reduce  24.029    23.231    0.786            5
    6.946      plyr  60.277    52.298    7.947            5
   13.796 castddply 119.719   113.333   10.816            5

# SAMPLE SIZE = 17,568
 relative   test elapsed user.self sys.self replications
    1.000     dt  27.421    13.042   14.470            5
    4.030 reduce 110.498    75.853   34.922            5
    5.414   plyr 148.452   105.776   43.156            5

Update: Added results for baseMethod1

# Used only sample size of 90, as it was taking long
 relative  test elapsed user.self sys.self replications
    1.000    dt   0.044     0.043    0.001            5
    7.773  plyr   0.342     0.339    0.003            5
   65.614 base1   2.887     2.866    0.028            5

Where

baseMethod1 <- quote({
  step1 <- with(df, split(df, list(Par1, Par2)))
  step2 <- step1[sapply(step1, nrow) > 0]
  step3 <- lapply(step2, function(x) {
    piece1 <- tapply(x$Val, x$Type, mean)
    piece2 <- tapply(x$Type, x$Type, length)
    names(piece2) <- paste0("Num.", names(piece2))
    out <- x[1, 1:2]
    out[, 3:6] <- c(piece1, piece2)
    names(out)[3:6] <- names(c(piece1, piece2))
    out$ParD <- paste(unique(x$ParD), collapse="_")
    out
  })
  data.frame(do.call(rbind, step3), row.names=NULL)
})

Update 2: Added keying the data.table as part of the benchmark

Adding the indexing step to the benchmark, for fairness, per @MatthewDowle's comment.
However, presumably, if data.table is being used at all, it will be used in place of data.frame, so the indexing will happen once, not just for this procedure.
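As a small sketch of that point (assuming the data.table package and the df built in the sample-data block above), the key can be set once up front with setkey(), after which repeated grouped queries reuse the keyed table:

library(data.table)

dt <- as.data.table(df)   # one-time conversion from data.frame
setkey(dt, Par1, Par2)    # one-time indexing step
# subsequent grouped queries, such as eval(dtMethod), reuse the keyed table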

dtMethod.withkey <- quote({
  dt <- data.table(df, key=c("Par1", "Par2"))
  dt[, list(pre      = mean(Val[Type == "pre"]),
            post     = mean(Val[Type == "post"]),
            Num.pre  = length(Val[Type == "pre"]),
            Num.post = length(Val[Type == "post"]),
            ParD     = paste(ParD, collapse="_")),
     by=list(Par1, Par2)]
})

# SAMPLE SIZE = 10,800
 relative       test elapsed user.self sys.self replications
    1.000         dt   9.155     7.055    2.137            5
    1.043 dt.withkey   9.553     7.245    2.353            5
    3.567     reduce  32.659    31.196    1.586            5
    6.703       plyr  61.364    54.080    7.600            5

Update 3: Benchmarking @MD's edits to @Arun's original answer

dtMethod.MD1 <- quote(
  dt[, list(pre=mean(Val[.pre <- Type=="pre"]),    # save .pre
            post=mean(Val[.post <- Type=="post"]), # save .post
            pre.num=sum(.pre),                     # reuse .pre
            post.num=sum(.post),                   # reuse .post
            ParD = paste(ParD, collapse="_")),
     by=list(Par1, Par2)]
)

dtMethod.MD2 <- quote(
  dt[, {
       .pre <- Type=="pre"    # or save .pre and .post up front
       .post <- Type=="post"
       list(pre=mean(Val[.pre]),
            post=mean(Val[.post]),
            pre.num=sum(.pre),
            post.num=sum(.post),
            ParD = paste(ParD, collapse="_"))
     }, by=list(Par1, Par2)]
)

dtMethod.MD3 <- quote(
  dt[, {
       .pre <- Type=="pre"
       .post <- Type=="post"
       list(pre=mean(Val[.pre]),
            post=mean(Val[.post]),
            pre.num=sum(.pre),
            post.num=sum(.post),
            ParD = list(ParD))   # list() faster than paste()
     }, by=list(Par1, Par2)]
)

benchmark(dt.M1=eval(dtMethod.MD1), dt.M2=eval(dtMethod.MD2),
          dt.M3=eval(dtMethod.MD3), dt=eval(dtMethod),
          replications=5,
          columns=c("relative", "test", "elapsed", "user.self", "sys.self", "replications"),
          order="relative")

#--------------------#

Comparing the different data.table methods amongst themselves

# SAMPLE SIZE = 900
 relative  test elapsed user.self sys.self replications
    1.000 dt.M3   0.198     0.197    0.001            5  <~~~ "list()" Method
    1.242 dt.M1   0.246     0.243    0.004            5
    1.253 dt.M2   0.248     0.242    0.007            5
    1.884    dt   0.373     0.367    0.007            5

# SAMPLE SIZE = 17,568
 relative  test elapsed user.self sys.self replications
    1.000 dt.M3  33.492    24.487    9.122            5  <~~~ "list()" Method
    1.086 dt.M1  36.388    11.442   25.086            5
    1.086 dt.M2  36.388    10.845   25.660            5
    1.126    dt  37.701    13.256   24.535            5

Comparing MD3 ("list" method) with MD1 (best of the DT non-list methods), using a clean session (ie, removing the string cache).

_Note: Ran the following twice, fresh session each time, with practically identical results. Then re-ran in the *same* session, with reps=5. Results very different._

benchmark(dt.M1=eval(dtMethod.MD1), dt.M3=eval(dtMethod.MD3),
          replications=1,
          columns=c("relative", "test", "elapsed", "user.self", "sys.self", "replications"),
          order="relative")

# SAMPLE SIZE=17,568; CLEAN SESSION
 relative  test elapsed user.self sys.self replications
    1.000 dt.M1   8.885     4.260    4.617            1
    1.633 dt.M3  14.506    12.821    1.677            1

# SAMPLE SIZE=17,568; *SAME* SESSION
 relative  test elapsed user.self sys.self replications
    1.000 dt.M1  33.443    10.200   23.226            5
    1.048 dt.M3  35.060    26.127    8.915            5

#--------------------#

New benchmarks against previous methods

_Note: Not using the "list method" as its results are not the same as the other methods'_

# SAMPLE SIZE = 900
 relative   test elapsed user.self sys.self replications
    1.000  dt.M1   0.254     0.247    0.008            5
    1.705 reduce   0.433     0.425    0.010            5
   11.280   plyr   2.865     2.842    0.031            5

# SAMPLE SIZE = 17,568
 relative   test elapsed user.self sys.self replications
    1.000  dt.M1  24.826    10.427   14.458            5
    4.348 reduce 107.935    70.107   38.314            5
    5.942   plyr 147.508   106.958   41.083            5


I believe this base R solution is comparable to @Arun's data.table solution. (Which isn't to say I would prefer it; that code is much simpler!)

baseMethod2 <- quote({
  is <- unname(split(1:nrow(df), with(df, paste(Par1, Par2, sep="/b"))))
  i1 <- sapply(is, `[`, 1)
  out <- with(df, data.frame(Par1=Par1[i1], Par2=Par2[i1]))
  js <- lapply(is, function(i) split(i, df$Type[i]))
  out$post     <- sapply(js, function(j) mean(df$Val[j$post]))
  out$pre      <- sapply(js, function(j) mean(df$Val[j$pre]))
  out$Num.pre  <- sapply(js, function(j) length(j$pre))
  out$Num.post <- sapply(js, function(j) length(j$post))
  out$ParD     <- sapply(is, function(x) paste(df$ParD[x], collapse="_"))
  out
})

Using @RicardoSaporta's timing code with 900, 2700, and 10,800 rows, respectively:

>   relative        test elapsed user.self sys.self replications
  3    1.000 baseMethod2   0.230     0.229        0            5
  1    1.130          dt   0.260     0.257        0            5
  2    8.752        plyr   2.013     2.006        0            5

>   relative        test elapsed user.self sys.self replications
  3    1.000 baseMethod2   0.877     0.872        0            5
  1    1.068          dt   0.937     0.934        0            5
  2    8.060        plyr   7.069     7.043        0            5

>   relative        test elapsed user.self sys.self replications
  1    1.000          dt   6.232     6.178    0.031            5
  3    1.085 baseMethod2   6.763     6.683    0.054            5
  2    7.263        plyr  45.261    44.983    0.104            5


An attempt at wrapping the different aggregation expressions into a self-contained function (the expressions should produce atomic values):

multi.by <- function(X, INDEX, ...) {
  expressions <- substitute(...())
  duplicates <- duplicated(INDEX)
  res <- do.call(rbind,
                 sapply(split(X, cumsum(!duplicates), drop=T),
                        function(part) sapply(expressions, eval, part, simplify=F),
                        simplify=F))
  if (is.data.frame(INDEX)) res <- cbind(INDEX[!duplicates, ], res)
  else rownames(res) <- INDEX[!duplicates]
  res
}

multi.by(df, df[, 1:2],
         pre=mean(Val[Type=="pre"]),
         post=mean(Val[Type=="post"]),
         Num.pre=sum(Type=="pre"),
         Num.post=sum(Type=="post"),
         ParD=paste(ParD, collapse="_"))


You could do a combination of two dcasts and one aggregate, here all wrapped up in one big expression, mostly to avoid having intermediate objects hanging around afterwards:

Reduce(merge, list(
  dcast(df, formula = Par1+Par2~Type, value.var="Val", fun.aggregate=mean),
  setNames(dcast(df, formula = Par1+Par2~Type, value.var="Val",
                 fun.aggregate=length),
           c("Par1", "Par2", "Num.post", "Num.pre")),
  aggregate(df["ParD"], df[c("Par1", "Par2")], paste, collapse="_")
))


A one-step solution combining reshape::cast with plyr::ddply:

cast(Par1 + Par2 + ParD ~ Type,
     data = ddply(df, .(Par1, Par2), transform, ParD = paste(ParD, collapse = "_")),
     fun  = c(mean, length))

NOTE that the dcast function in reshape2 does not allow multiple aggregate functions to be passed, while the cast function in reshape does.


A 2-step solution using ddply (which I am not happy with, but it gets the result):

dat <- ddply(df, .(Par1, Par2), function(x){
  data.frame(ParD     = paste(paste(x$ParD), collapse="_"),
             Num.pre  = length(x$Type[x$Type == "pre"]),
             Num.post = length(x$Type[x$Type == "post"]))
})
merge(dfw, dat)

  Par1 Par2 post pre        ParD Num.pre Num.post
1    A    D  2.0   1     foo_bar       1        1
2    B    E  4.0   3     baz_qux       1        1
3    C    F  6.5   5 bla_xyz_meh       1        2


Late to the party, but here is another alternative using data.table:

require(data.table)
dt <- data.table(df, key=c("Par1", "Par2"))
dt[, list(pre      = mean(Val[Type == "pre"]),
          post     = mean(Val[Type == "post"]),
          pre.num  = length(Val[Type == "pre"]),
          post.num = length(Val[Type == "post"]),
          ParD     = paste(ParD, collapse="_")),
   by=list(Par1, Par2)]

#    Par1 Par2 pre post pre.num post.num        ParD
# 1:    A    D  10   20       1        1     foo_bar
# 2:    B    E  30   40       1        1     baz_qux
# 3:    C    F  50   65       1        2 bla_xyz_meh

[From Matthew] +1 A few minor improvements to save repeating the same ==, and to demonstrate local variables inside j.

dt[, list(pre=mean(Val[.pre <- Type=="pre"]),    # save .pre
          post=mean(Val[.post <- Type=="post"]), # save .post
          pre.num=sum(.pre),                     # reuse .pre
          post.num=sum(.post),                   # reuse .post
          ParD = paste(ParD, collapse="_")),
   by=list(Par1, Par2)]

#    Par1 Par2 pre post pre.num post.num        ParD
# 1:    A    D  10   20       1        1     foo_bar
# 2:    B    E  30   40       1        1     baz_qux
# 3:    C    F  50   65       1        2 bla_xyz_meh

dt[, {
     .pre <- Type=="pre"    # or save .pre and .post up front
     .post <- Type=="post"
     list(pre=mean(Val[.pre]),
          post=mean(Val[.post]),
          pre.num=sum(.pre),
          post.num=sum(.post),
          ParD = paste(ParD, collapse="_"))
   }, by=list(Par1, Par2)]

#    Par1 Par2 pre post pre.num post.num        ParD
# 1:    A    D  10   20       1        1     foo_bar
# 2:    B    E  30   40       1        1     baz_qux
# 3:    C    F  50   65       1        2 bla_xyz_meh

And if a list column is okay instead of paste, then this should be faster:

dt[, {
     .pre <- Type=="pre"
     .post <- Type=="post"
     list(pre=mean(Val[.pre]),
          post=mean(Val[.post]),
          pre.num=sum(.pre),
          post.num=sum(.post),
          ParD = list(ParD))   # list() faster than paste()
   }, by=list(Par1, Par2)]

#    Par1 Par2 pre post pre.num post.num        ParD
# 1:    A    D  10   20       1        1     foo,bar
# 2:    B    E  30   40       1        1     baz,qux
# 3:    C    F  50   65       1        2 bla,xyz,meh
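If plain character strings are eventually needed, one possible follow-up (an assumption, not part of the original answer) is to keep the fast list() aggregation and collapse the list column only at the end:

# Hypothetical follow-up: aggregate with list(), then collapse ParD once.
res <- dt[, {
            .pre  <- Type == "pre"
            .post <- Type == "post"
            list(pre = mean(Val[.pre]), post = mean(Val[.post]),
                 pre.num = sum(.pre), post.num = sum(.post),
                 ParD = list(ParD))
          }, by = list(Par1, Par2)]
res[, ParD := sapply(ParD, paste, collapse = "_")]   # collapse to strings at the end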


I'll post this anyway, but agstudy's answer puts mine to shame:

step1 <- with(df, split(df, list(Par1, Par2)))
step2 <- step1[sapply(step1, nrow) > 0]
step3 <- lapply(step2, function(x) {
  piece1 <- tapply(x$Val, x$Type, mean)
  piece2 <- tapply(x$Type, x$Type, length)
  names(piece2) <- paste0("Num.", names(piece2))
  out <- x[1, 1:2]
  out[, 3:6] <- c(piece1, piece2)
  names(out)[3:6] <- names(c(piece1, piece2))
  out$ParD <- paste(unique(x$ParD), collapse="_")
  out
})
data.frame(do.call(rbind, step3), row.names=NULL)

Yielding:

  Par1 Par2 post pre Num.post Num.pre        ParD
1    A    D  2.0   1        1       1     foo_bar
2    B    E  4.0   3        1       1     baz_qux
3    C    F  6.5   5        2       1 bla_xyz_meh