r split vectorization

Dividir columna de cadena para crear nuevas columnas binarias



split vectorization (7)

Al tener un problema similar, pero más complejo, pensé en la siguiente forma funcional que permite emitir de forma ''caliente'' los valores de caracteres separados por cadenas de una columna / s individualmente, así como las variables categóricas en sí:

oneHotOnText <- function(datatable, columns, seperator=", "){ #argument columns is character vector or numeric vector if(! "data.table" %in% .packages()) if(!require(data.table)) { install.packages("data.table"); library(data.table) } if(! "data.table" %in% class(datatable)) TempDT <- as.data.table(datatable) else TempDT <- copy(datatable) for(i in TempDT[, columns, with = F]){ if(class(i) != "character") i <- as.character(i) uniqueValues <- unique(unlist(strsplit(unique(i), split=seperator))) if(any(uniqueValues %in% names(TempDT))) { print("Value/s of the selected column/s is/are present as variables name/s. Rename it/them.") rm(TempDT) break } for(j in uniqueValues) TempDT[, (j) := ifelse(grepl(j, i), 1L, 0L)] } if(exists("TempDT")) return(TempDT) } DF = data.frame( aColumn=rep(c("f", "b", "c"), 100000), xColumn=rep(c("N/W", "W", "R"), 100000), yColumn=rep(c("A/B", "A/V", "B/G"), 100000), zColumn=rep(20:22, 100000)) str(DF) #factors are present in the data.frame oneHotOnText(DF, columns = c("aColumn", "xColumn", "yColumn"), seperator="/")[] #applies the function, returns a data.table and prints the result # aColumn xColumn yColumn zColumn f b c N W R A B V G # 1: f N/W A/B 20 1 0 0 1 1 0 1 1 0 0 # 2: b W A/V 21 0 1 0 0 1 0 1 0 1 0 # 3: c R B/G 22 0 0 1 0 0 1 0 1 0 1 # 4: f N/W A/B 20 1 0 0 1 1 0 1 1 0 0 # 5: b W A/V 21 0 1 0 0 1 0 1 0 1 0 # --- #299996: b W A/V 21 0 1 0 0 1 0 1 0 1 0 #299997: c R B/G 22 0 0 1 0 0 1 0 1 0 1 #299998: f N/W A/B 20 1 0 0 1 1 0 1 1 0 0 #299999: b W A/V 21 0 1 0 0 1 0 1 0 1 0 #300000: c R B/G 22 0 0 1 0 0 1 0 1 0 1

Del mismo modo, se aplica al problema del OP:

input <- data.frame(ALL = c("/ca/put/sent_1/fe.gr/eq2_on/eq2_off", "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov", "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov", "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cni_at.p3x.4", "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL" )) oneHotOnText(input, columns = "ALL", seperator = "/")[]

Mis datos tienen una columna y estoy tratando de crear columnas adicionales con lo que está después de cada "/" en las filas. Aquí están las primeras filas de los datos:

> dput(mydata) structure(list(ALL = structure(c(1L, 4L, 4L, 3L, 2L), .Label = c("/ ca/put/sent_1/fe.gr/eq2_on/eq2_off", "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL", "/ca/put/sent_1/fe.g r/eq2_on/eq2_off/cni_at.p3x.4", "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov"), class = "factor ")), .Names = "ALL", class = "data.frame", row.names = c(NA, -5L))

El resultado debería verse así (marco de datos) con un "1" en la nueva columna si la variable aparece en la fila y "0" si no:

> dput(Result) structure(list(ALL = structure(c(1L, 4L, 5L, 3L, 2L), .Label = c("/ca /put/sent_1/fe.gr/eq2_on/eq2_off", "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL", "/ca/put/sent_1/fe.gr/ eq2_on/eq2_off/cni_at.p3x.4", "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov", "/ca/put/sent_1fe. gr/eq2_on/eq2_off/hi.on/hi.ov" ), class = "factor"), ca = c(1L, 1L, 1L, 1L, 1L), put = c(1L, 1L, 1L, 1L, 1L), sent_1 = c(1L, 1L, 1L, 1L, 1L), fe.gr = c(1L, 1L, 1L, 1L, 1L), eq2_on = c(1L, 1L, 1L, 1L, 1L), eq2_off = c(1L, 1L, 1L, 1L, 1L), hi.on = c(0L, 1L, 1L, 0L, 0L), hi.ov = c(0L, 1L, 1L, 0L, 0L), cni_at.p3x.4 = c(0L, 0L, 0L, 1L, 0L), cbr_LBL = c(0L , 0L, 0L, 0L, 1L)), .Names = c("ALL", "ca", "put", "sent_1", "fe.gr", "eq2_on", "eq2_off", "hi.on", "hi.ov", "cni_at.p3x.4", "cbr_LBL" ), class = "data.frame", row.names = c(NA, -5L))

He probado muchas funciones, incluyendo strsplit y sapply:

sapply(strsplit(as.character(mydata$ALL), “///”), “[[“, 2) #returns "ca"s only sapply(strsplit(as.character(mydata$ALL), "///"), "[[", 3) #returns "put"s only

Hay millones de filas y agradecería mucho cualquier cosa que sea rápida y eficiente.


Aquí hay una solución que usa dplyr y tidyr (nota: limpié lo que parece faltar / en la fila tres de sus datos de muestra):

## Input input <- structure( list(ALL = structure(c(1L, 4L, 5L, 3L, 2L), .Label = c("/ca/put/sent_1/fe.gr/eq2_on/eq2_off", "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL", "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cni_at.p3x.4", "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov", "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov"), class = "factor")), .Names = "ALL", class = "data.frame", row.names = c(NA, -5L)) ## Solution require(dplyr) require(tidyr) solution <- input %>% mutate(temp = sub("^/", "", ALL)) %>% separate(temp, c("ca", "put", "sent_1", "fe.gr", "eq2_on", "eq2_off", "hi.on", "hi.ov", "cni_at.p3x.4", "cbr_LBL"), "/", extra="merge") %>% mutate_each(funs(as.numeric(!is.na(.))), -ALL)


Otra opción es melt la cadena split en la list en forma long y luego usar la table

library(reshape2) as.data.frame.matrix(table(melt(strsplit(as.character( mydata[[1]]), "/"))[2:1]))[,-1] # ca eq2_off eq2_on fe.gr put sent_1 hi.on hi.ov sent_1fe.gr cni_at.p3x.4 #1 1 1 1 1 1 1 0 0 0 0 #2 1 1 1 1 1 1 1 1 0 0 #3 1 1 1 0 1 0 1 1 1 0 #4 1 1 1 1 1 1 0 0 0 1 #5 1 1 1 1 1 1 0 0 0 0 # cbr_LBL #1 0 #2 0 #3 0 #4 0 #5 1


Puede usar cSplit_e de mi paquete "splitstackshape":

library(splitstackshape) cSplit_e(mydata, "ALL", "/", type = "character", fill = 0) # ALL ALL_ca ALL_cbr_LBL # 1 /ca/put/sent_1/fe.gr/eq2_on/eq2_off 1 0 # 2 /ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov 1 0 # 3 /ca/put/sent_1fe.gr/eq2_on/eq2_off/hi.on/hi.ov 1 0 # 4 /ca/put/sent_1/fe.gr/eq2_on/eq2_off/cni_at.p3x.4 1 0 # 5 /ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL 1 1 # ALL_cni_at.p3x.4 ALL_eq2_off ALL_eq2_on ALL_fe.gr ALL_hi.on ALL_hi.ov ALL_put # 1 0 1 1 1 0 0 1 # 2 0 1 1 1 1 1 1 # 3 0 1 1 0 1 1 1 # 4 1 1 1 1 0 0 1 # 5 0 1 1 1 0 0 1 # ALL_sent_1 ALL_sent_1fe.gr # 1 1 0 # 2 1 0 # 3 0 1 # 4 1 0 # 5 1 0

(Nota: creo que hay un problema en la fila 3 de su dput por lo que no coincide con la salida deseada. Observe que el tercer elemento en la fila 3 es "sent_1fe.gr" sin "/" entre ellos).


Qué tal algo como esto

spt <- strsplit(as.character(mydata$ALL),"/", fixed=T) do.call(rbind, lapply(lapply(spt, factor, levels=unique(unlist(spt))), table))

que vuelve

ca put sent_1 fe.gr eq2_on eq2_off hi.on hi.ov sent_1fe.gr cni_at.p3x.4 cbr_LBL [1,] 1 1 1 1 1 1 1 0 0 0 0 0 [2,] 1 1 1 1 1 1 1 1 1 0 0 0 [3,] 1 1 1 1 0 1 1 1 1 1 0 0 [4,] 1 1 1 1 1 1 1 0 0 0 1 0 [5,] 1 1 1 1 1 1 1 0 0 0 0 1


Usando mtabuate del paquete qdapTools que mantengo:

library(qdapTools) mtabulate(strsplit(as.character(dat[[1]]), "/")) ## V1 ca cbr_LBL cni_at.p3x.4 eq2_off eq2_on fe.gr hi.on hi.ov put sent_1 sent_1fe.gr ## 1 1 1 0 0 1 1 1 0 0 1 1 0 ## 2 1 1 0 0 1 1 1 1 1 1 1 0 ## 3 1 1 0 0 1 1 0 1 1 1 0 1 ## 4 1 1 0 1 1 1 1 0 0 1 1 0 ## 5 1 1 1 0 1 1 1 0 0 1 1 0


una solución tidyverse

library(tidyverse) mydata %>% rownames_to_column() %>% mutate(key = strsplit(levels(ALL)[ALL],"/"),value=1) %>% unnest %>% spread(key,value,0) %>% select(-rowname) # ALL ca cbr_LBL cni_at.p3x.4 eq2_off eq2_on fe.gr hi.on hi.ov put sent_1 # 1 1 1 1 0 0 1 1 1 0 0 1 1 # 2 4 1 1 0 0 1 1 1 1 1 1 1 # 3 4 1 1 0 0 1 1 1 1 1 1 1 # 4 3 1 1 0 1 1 1 1 0 0 1 1 # 5 2 1 1 1 0 1 1 1 0 0 1 1

datos

mydata <- structure(list(ALL = structure(c(1L, 4L, 4L, 3L, 2L), .Label = c( "/ca/put/sent_1/fe.gr/eq2_on/eq2_off", "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL", "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cni_at.p3x.4", "/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov"), class = "factor ")), .Names = "ALL", class = "data.frame", row.names = c(NA,-5L))