Dividir columna de cadena para crear nuevas columnas binarias
split vectorization (7)
Al tener un problema similar, pero más complejo, pensé en la siguiente forma funcional que permite emitir de forma ''caliente'' los valores de caracteres separados por cadenas de una columna / s individualmente, así como las variables categóricas en sí:
oneHotOnText <- function(datatable, columns, seperator=", "){ #argument columns is character vector or numeric vector
if(! "data.table" %in% .packages()) if(!require(data.table)) { install.packages("data.table"); library(data.table) }
if(! "data.table" %in% class(datatable)) TempDT <- as.data.table(datatable) else TempDT <- copy(datatable)
for(i in TempDT[, columns, with = F]){
if(class(i) != "character") i <- as.character(i)
uniqueValues <- unique(unlist(strsplit(unique(i), split=seperator)))
if(any(uniqueValues %in% names(TempDT))) { print("Value/s of the selected column/s is/are present as variables name/s. Rename it/them.")
rm(TempDT)
break }
for(j in uniqueValues) TempDT[, (j) := ifelse(grepl(j, i), 1L, 0L)]
}
if(exists("TempDT")) return(TempDT)
}
DF = data.frame(
aColumn=rep(c("f", "b", "c"), 100000),
xColumn=rep(c("N/W", "W", "R"), 100000),
yColumn=rep(c("A/B", "A/V", "B/G"), 100000),
zColumn=rep(20:22, 100000))
str(DF) #factors are present in the data.frame
oneHotOnText(DF, columns = c("aColumn", "xColumn", "yColumn"), seperator="/")[] #applies the function, returns a data.table and prints the result
# aColumn xColumn yColumn zColumn f b c N W R A B V G
# 1: f N/W A/B 20 1 0 0 1 1 0 1 1 0 0
# 2: b W A/V 21 0 1 0 0 1 0 1 0 1 0
# 3: c R B/G 22 0 0 1 0 0 1 0 1 0 1
# 4: f N/W A/B 20 1 0 0 1 1 0 1 1 0 0
# 5: b W A/V 21 0 1 0 0 1 0 1 0 1 0
# ---
#299996: b W A/V 21 0 1 0 0 1 0 1 0 1 0
#299997: c R B/G 22 0 0 1 0 0 1 0 1 0 1
#299998: f N/W A/B 20 1 0 0 1 1 0 1 1 0 0
#299999: b W A/V 21 0 1 0 0 1 0 1 0 1 0
#300000: c R B/G 22 0 0 1 0 0 1 0 1 0 1
Del mismo modo, se aplica al problema del OP:
input <- data.frame(ALL = c("/ca/put/sent_1/fe.gr/eq2_on/eq2_off",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cni_at.p3x.4",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL"
))
oneHotOnText(input, columns = "ALL", seperator = "/")[]
Mis datos tienen una columna y estoy tratando de crear columnas adicionales con lo que está después de cada "/" en las filas. Aquí están las primeras filas de los datos:
> dput(mydata)
structure(list(ALL = structure(c(1L, 4L, 4L, 3L, 2L), .Label = c("/
ca/put/sent_1/fe.gr/eq2_on/eq2_off",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL", "/ca/put/sent_1/fe.g
r/eq2_on/eq2_off/cni_at.p3x.4",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov"), class = "factor
")), .Names = "ALL", class = "data.frame", row.names = c(NA,
-5L))
El resultado debería verse así (marco de datos) con un "1" en la nueva columna si la variable aparece en la fila y "0" si no:
> dput(Result)
structure(list(ALL = structure(c(1L, 4L, 5L, 3L, 2L), .Label = c("/ca
/put/sent_1/fe.gr/eq2_on/eq2_off",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL", "/ca/put/sent_1/fe.gr/
eq2_on/eq2_off/cni_at.p3x.4",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov", "/ca/put/sent_1fe.
gr/eq2_on/eq2_off/hi.on/hi.ov"
), class = "factor"), ca = c(1L, 1L, 1L, 1L, 1L), put = c(1L,
1L, 1L, 1L, 1L), sent_1 = c(1L, 1L, 1L, 1L, 1L), fe.gr = c(1L,
1L, 1L, 1L, 1L), eq2_on = c(1L, 1L, 1L, 1L, 1L), eq2_off = c(1L,
1L, 1L, 1L, 1L), hi.on = c(0L, 1L, 1L, 0L, 0L), hi.ov = c(0L,
1L, 1L, 0L, 0L), cni_at.p3x.4 = c(0L, 0L, 0L, 1L, 0L), cbr_LBL = c(0L
,
0L, 0L, 0L, 1L)), .Names = c("ALL", "ca", "put", "sent_1", "fe.gr",
"eq2_on", "eq2_off", "hi.on", "hi.ov", "cni_at.p3x.4", "cbr_LBL"
), class = "data.frame", row.names = c(NA, -5L))
He probado muchas funciones, incluyendo strsplit y sapply:
sapply(strsplit(as.character(mydata$ALL), “///”), “[[“, 2) #returns "ca"s only
sapply(strsplit(as.character(mydata$ALL), "///"), "[[", 3) #returns "put"s only
Hay millones de filas y agradecería mucho cualquier cosa que sea rápida y eficiente.
Aquí hay una solución que usa dplyr y tidyr (nota: limpié lo que parece faltar / en la fila tres de sus datos de muestra):
## Input
input <- structure(
list(ALL = structure(c(1L, 4L, 5L, 3L, 2L),
.Label = c("/ca/put/sent_1/fe.gr/eq2_on/eq2_off",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cni_at.p3x.4",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov"),
class = "factor")),
.Names = "ALL", class = "data.frame", row.names = c(NA, -5L))
## Solution
require(dplyr)
require(tidyr)
solution <- input %>%
mutate(temp = sub("^/", "", ALL)) %>%
separate(temp,
c("ca", "put", "sent_1", "fe.gr", "eq2_on", "eq2_off",
"hi.on", "hi.ov", "cni_at.p3x.4", "cbr_LBL"),
"/", extra="merge") %>%
mutate_each(funs(as.numeric(!is.na(.))), -ALL)
Otra opción es
melt
la cadena
split
en la
list
en forma
long
y luego usar la
table
library(reshape2)
as.data.frame.matrix(table(melt(strsplit(as.character(
mydata[[1]]), "/"))[2:1]))[,-1]
# ca eq2_off eq2_on fe.gr put sent_1 hi.on hi.ov sent_1fe.gr cni_at.p3x.4
#1 1 1 1 1 1 1 0 0 0 0
#2 1 1 1 1 1 1 1 1 0 0
#3 1 1 1 0 1 0 1 1 1 0
#4 1 1 1 1 1 1 0 0 0 1
#5 1 1 1 1 1 1 0 0 0 0
# cbr_LBL
#1 0
#2 0
#3 0
#4 0
#5 1
Puede usar
cSplit_e
de mi paquete "splitstackshape":
library(splitstackshape)
cSplit_e(mydata, "ALL", "/", type = "character", fill = 0)
# ALL ALL_ca ALL_cbr_LBL
# 1 /ca/put/sent_1/fe.gr/eq2_on/eq2_off 1 0
# 2 /ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov 1 0
# 3 /ca/put/sent_1fe.gr/eq2_on/eq2_off/hi.on/hi.ov 1 0
# 4 /ca/put/sent_1/fe.gr/eq2_on/eq2_off/cni_at.p3x.4 1 0
# 5 /ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL 1 1
# ALL_cni_at.p3x.4 ALL_eq2_off ALL_eq2_on ALL_fe.gr ALL_hi.on ALL_hi.ov ALL_put
# 1 0 1 1 1 0 0 1
# 2 0 1 1 1 1 1 1
# 3 0 1 1 0 1 1 1
# 4 1 1 1 1 0 0 1
# 5 0 1 1 1 0 0 1
# ALL_sent_1 ALL_sent_1fe.gr
# 1 1 0
# 2 1 0
# 3 0 1
# 4 1 0
# 5 1 0
(Nota: creo que hay un problema en la fila 3 de su
dput
por lo que no coincide con la salida deseada. Observe que el tercer elemento en la fila 3 es "sent_1fe.gr" sin "/" entre ellos).
Qué tal algo como esto
spt <- strsplit(as.character(mydata$ALL),"/", fixed=T)
do.call(rbind, lapply(lapply(spt, factor, levels=unique(unlist(spt))), table))
que vuelve
ca put sent_1 fe.gr eq2_on eq2_off hi.on hi.ov sent_1fe.gr cni_at.p3x.4 cbr_LBL
[1,] 1 1 1 1 1 1 1 0 0 0 0 0
[2,] 1 1 1 1 1 1 1 1 1 0 0 0
[3,] 1 1 1 1 0 1 1 1 1 1 0 0
[4,] 1 1 1 1 1 1 1 0 0 0 1 0
[5,] 1 1 1 1 1 1 1 0 0 0 0 1
Usando
mtabuate
del paquete
qdapTools
que mantengo:
library(qdapTools)
mtabulate(strsplit(as.character(dat[[1]]), "/"))
## V1 ca cbr_LBL cni_at.p3x.4 eq2_off eq2_on fe.gr hi.on hi.ov put sent_1 sent_1fe.gr
## 1 1 1 0 0 1 1 1 0 0 1 1 0
## 2 1 1 0 0 1 1 1 1 1 1 1 0
## 3 1 1 0 0 1 1 0 1 1 1 0 1
## 4 1 1 0 1 1 1 1 0 0 1 1 0
## 5 1 1 1 0 1 1 1 0 0 1 1 0
una solución
tidyverse
library(tidyverse)
mydata %>%
rownames_to_column() %>%
mutate(key = strsplit(levels(ALL)[ALL],"/"),value=1) %>%
unnest %>%
spread(key,value,0) %>%
select(-rowname)
# ALL ca cbr_LBL cni_at.p3x.4 eq2_off eq2_on fe.gr hi.on hi.ov put sent_1
# 1 1 1 1 0 0 1 1 1 0 0 1 1
# 2 4 1 1 0 0 1 1 1 1 1 1 1
# 3 4 1 1 0 0 1 1 1 1 1 1 1
# 4 3 1 1 0 1 1 1 1 0 0 1 1
# 5 2 1 1 1 0 1 1 1 0 0 1 1
datos
mydata <- structure(list(ALL = structure(c(1L, 4L, 4L, 3L, 2L), .Label = c(
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cni_at.p3x.4",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov"), class = "factor
")), .Names = "ALL", class = "data.frame", row.names = c(NA,-5L))