data.table interpolando linealmente valores de NA sin grupos

(5)

Quería completar algunos valores de NA en una tabla de datos sin grupos. Considere este extracto de data.table que representa el tiempo y las distancias:

library(data.table) df <- data.frame(time = seq(7173, 7195, 1), dist = c(31091.33, NA, 31100.00, 31103.27, NA, NA, NA, NA, 31124.98, NA,31132.81, NA, NA, NA, NA, 31154.19, NA, 31161.47, NA, NA, NA, NA, 31182.97)) DT<- data.table(df)

Quiero en DT data.table, para llenar los valores de NA con una función que depende del valor que no sea NA antes / después. Como ejemplo, escribir una función en j para reemplazar cada instrucción

DT[2, dist := (31091.33 + (31100-31091.33) / 2)]

luego

DT[5:8, dist := (31103.27 + "something" * (31124.98 - 31103.27) / 5)]

etc ...

2 otras opciones:

1) junta rodante:

DT[is.na(dist), dist := { x0y0 <- DT[!is.na(dist)][.SD, on=.(time), roll=Inf, .(time=x.time, dist=x.dist)] x1y1 <- DT[!is.na(dist)][.SD, on=.(time), roll=-Inf, .(time=x.time, dist=x.dist)] (x1y1$dist - x0y0$dist) / (x1y1$time - x0y0$time) * (time - x0y0$time) + x0y0$dist }] DT

2) otra variante cercana de respuesta nafill usando nafill

DT[, dist := { y0 <- nafill(dist, "locf") x0 <- nafill(replace(time, is.na(dist), NA), "locf") y1 <- nafill(dist, "nocb") x1 <- nafill(replace(time, is.na(dist), NA), "nocb") fifelse(is.na(dist), (y1 - y0) / (x1 - x0) * (time - x0) + y0, dist) }]

código de tiempo:

library(data.table) set.seed(0L) # df=data.frame(time=seq(7173,7195,1),dist=c(31091.33,NA,31100.00,31103.27,NA,NA,NA,NA,31124.98,NA,31132.81,NA,NA,NA,NA,31154.19,NA,31161.47,NA,NA,NA,NA,31182.97)) # DT=data.table(df) nr <- 1e6 nNA <- nr/2 DT <- data.table(time=1:nr, dist=replace(rnorm(nr), sample(1:nr, nNA), NA_real_)) DT00 <- copy(DT) DT01 <- copy(DT) DT1 <- copy(DT) DT20 <- copy(DT) DT21 <- copy(DT) mtd00 <- function() { DT00[, g := rleid(is.na(dist))] DT00[is.na(dist), dist := { i <- .I[c(1, .N)] + c(-1, 1) DT00[i[1]:i[2], approx(dist, n = .N)$y[-c(1, .N)]] }, by = g] } mtd01 <- function() { DT01[, g := rleid(is.na(dist))] DT01[is.na(dist), dist := { i <- .I[c(1, .N)] + c(-1, 1) DT01[i[1]:i[2], dist[1] + 1:(.N - 2)*(dist[.N] - dist[1])/(.N - 1)] }, by = g] } mtd1 <- function() { DT1[,dist_before := nafill(dist, "locf")] DT1[,dist_after := nafill(dist, "nocb")] DT1[, rle := rleid(dist)][,missings := max(.N + 1 , 2), by = rle][] DT1[is.na(dist), dist := dist_before + .SD[,.I] * (dist_after - dist_before)/(missings), by = rle] } mtd20 <- function() { DT20[is.na(dist), dist := { x0y0 <- DT[!is.na(dist)][.SD, on=.(time), roll=Inf, .(time=x.time, dist=x.dist)] x1y1 <- DT[!is.na(dist)][.SD, on=.(time), roll=-Inf, .(time=x.time, dist=x.dist)] (x1y1$dist - x0y0$dist) / (x1y1$time - x0y0$time) * (time - x0y0$time) + x0y0$dist }] } mtd21 <- function() { DT21[, dist := { y0 <- nafill(dist, "locf") x0 <- nafill(replace(time, is.na(dist), NA), "locf") y1 <- nafill(dist, "nocb") x1 <- nafill(replace(time, is.na(dist), NA), "nocb") fifelse(is.na(dist), (y1 - y0) / (x1 - x0) * (time - x0) + y0, dist) }] } bench::mark(#mtd00(), mtd01(), mtd1(), mtd20(), mtd21(), check=FALSE) # all.equal(mtd01()$dist, mtd1()$dist) #[1] TRUE # all.equal(mtd20()$dist, mtd1()$dist) #[1] TRUE # all.equal(mtd21()$dist, mtd1()$dist) #[1] TRUE

tiempos:

# A tibble: 3 x 13 expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list> 1 mtd1() 834.6ms 834.6ms 1.20 7.91GB 2.40 1 2 834.57ms <df[,6] [1,000,000 x 6]> <df[,3] [1,814,567 x 3]> <bch:tm> <tibble [1 x 3]> 2 mtd20() 40.2ms 42.4ms 19.6 99.53MB 3.92 10 2 509.89ms <df[,2] [1,000,000 x 2]> <df[,3] [301 x 3]> <bch:tm> <tibble [10 x 3]> 3 mtd21() 37.5ms 60.1ms 3.89 76.46MB 1.95 8 4 2.05s <df[,2] [1,000,000 x 2]> <df[,3] [225 x 3]> <bch:tm> <tibble [8 x 3]>

editar: para abordar el comentario sobre el uso de is.na(dist) varias veces:

set.seed(0L) nr <- 1e7 nNA <- nr/2 DT <- data.table(time=1:nr, dist=replace(rnorm(nr), sample(1:nr, nNA), NA_real_)) DT20 <- copy(DT) DT201 <- copy(DT) DT202 <- copy(DT) mtd20 <- function() { DT20[is.na(dist), dist := { x0y0 <- DT20[!is.na(dist)][.SD, on=.(time), roll=Inf, .(time=x.time, dist=x.dist)] x1y1 <- DT20[!is.na(dist)][.SD, on=.(time), roll=-Inf, .(time=x.time, dist=x.dist)] (x1y1$dist - x0y0$dist) / (x1y1$time - x0y0$time) * (time - x0y0$time) + x0y0$dist }] } mtd201 <- function() { i <- DT201[, is.na(dist)] DT201[(i), dist := { x0y0 <- DT201[(!i)][.SD, on=.(time), roll=Inf, .(time=x.time, dist=x.dist)] x1y1 <- DT201[(!i)][.SD, on=.(time), roll=-Inf, .(time=x.time, dist=x.dist)] (x1y1$dist - x0y0$dist) / (x1y1$time - x0y0$time) * (time - x0y0$time) + x0y0$dist }] } mtd202 <- function() { i <- DT201[is.na(dist), which=TRUE] DT201[i, dist := { x0y0 <- DT201[-i][.SD, on=.(time), roll=Inf, .(time=x.time, dist=x.dist)] x1y1 <- DT201[-i][.SD, on=.(time), roll=-Inf, .(time=x.time, dist=x.dist)] (x1y1$dist - x0y0$dist) / (x1y1$time - x0y0$time) * (time - x0y0$time) + x0y0$dist }] }

tiempos:

# A tibble: 3 x 13 expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list> 1 mtd20() 24.1ms 25.8ms 37.5 1.01GB 13.6 11 4 294ms <df[,2] [10,000,000 x 2]> <df[,3] [310 x 3]> <bch:tm> <tibble [15 x 3]> 2 mtd201() 24.8ms 25.6ms 38.2 954.07MB 8.19 14 3 366ms <df[,2] [10,000,000 x 2]> <df[,3] [398 x 3]> <bch:tm> <tibble [17 x 3]> 3 mtd202() 24ms 25.6ms 38.3 76.39MB 8.22 14 3 365ms <df[,2] [10,000,000 x 2]> <df[,3] [241 x 3]> <bch:tm> <tibble [17 x 3]>

No se ven muchas diferencias en los tiempos cuando se reduce la cantidad de is.na(dist)

Aquí hay un enfoque rcpp que recorre todo una vez con pase adicional para todos los elementos de NA.

#include <Rcpp.h> using namespace Rcpp; // Enable C++11 via this plugin (Rcpp 0.10.3 or later) // [[Rcpp::plugins(cpp11)]] // [[Rcpp::export]] NumericVector rcpp_approx(NumericVector y) { double start = 0; double slope = 0; int count = 0; int first_non_na = 0; NumericVector x = clone(y); //added to not update-by-reference //need to handle vectors that start with NA. This will return "NA" for(int k = 0; k < x.size(); k++) { if (!NumericVector::is_na(x[k])) break; first_non_na = k + 1; } //iterator seems similar to the idea of *apply with the added benefit // that we can reference relative to the element being acted on. for(NumericVector::iterator i = std::next(x.begin(), first_non_na); i != x.end(); ++i){ if (NumericVector::is_na(*i)){ count++; } else { if (count != 0) { start = *(i-(count+1)); slope = (*i - start) / (count + 1); for (int j = 0; j < count; j++){ *(i-(count-j)) = start + slope * (j + 1); } count = 0; } } } return(x); }

Luego en R:

DT[, rcpp_approx(dist)]

El código se explica en línea. Puede eliminar las columnas temporales usando df[,dist_before := NULL] , por ejemplo.

library(data.table) df=data.table(time=seq(7173,7195,1),dist=c(31091.33,NA,31100.00,31103.27,NA,NA,NA, NA,31124.98,NA,31132.81,NA,NA,NA,NA,31154.19,NA,31161.47,NA,NA,NA,NA,31182.97)) df #> time dist #> 1: 7173 31091.33 #> 2: 7174 NA #> 3: 7175 31100.00 #> 4: 7176 31103.27 #> 5: 7177 NA #> 6: 7178 NA #> 7: 7179 NA #> 8: 7180 NA #> 9: 7181 31124.98 #> 10: 7182 NA #> 11: 7183 31132.81 #> 12: 7184 NA #> 13: 7185 NA #> 14: 7186 NA #> 15: 7187 NA #> 16: 7188 31154.19 #> 17: 7189 NA #> 18: 7190 31161.47 #> 19: 7191 NA #> 20: 7192 NA #> 21: 7193 NA #> 22: 7194 NA #> 23: 7195 31182.97 #> time dist # Carry forward the last non-missing observation df[,dist_before := nafill(dist, "locf")] # Bring back the next non-missing dist df[,dist_after := nafill(dist, "nocb")] # rleid will create groups based on run-lengths of values within the data. # This means 4 NA''s in a row will be grouped together, for example. # We then count the missings and add 1, because we want the # last NA before the next non-missing to be less than the non-missing value. df[, rle := rleid(dist)][,missings := max(.N + 1 , 2), by = rle][] #> time dist dist_before dist_after rle missings #> 1: 7173 31091.33 31091.33 31091.33 1 2 #> 2: 7174 NA 31091.33 31100.00 2 2 #> 3: 7175 31100.00 31100.00 31100.00 3 2 #> 4: 7176 31103.27 31103.27 31103.27 4 2 #> 5: 7177 NA 31103.27 31124.98 5 5 #> 6: 7178 NA 31103.27 31124.98 5 5 #> 7: 7179 NA 31103.27 31124.98 5 5 #> 8: 7180 NA 31103.27 31124.98 5 5 #> 9: 7181 31124.98 31124.98 31124.98 6 2 #> 10: 7182 NA 31124.98 31132.81 7 2 #> 11: 7183 31132.81 31132.81 31132.81 8 2 #> 12: 7184 NA 31132.81 31154.19 9 5 #> 13: 7185 NA 31132.81 31154.19 9 5 #> 14: 7186 NA 31132.81 31154.19 9 5 #> 15: 7187 NA 31132.81 31154.19 9 5 #> 16: 7188 31154.19 31154.19 31154.19 10 2 #> 17: 7189 NA 31154.19 31161.47 11 2 #> 18: 7190 31161.47 31161.47 31161.47 12 2 #> 19: 7191 NA 31161.47 31182.97 13 5 #> 20: 7192 NA 31161.47 31182.97 13 5 #> 21: 7193 NA 31161.47 31182.97 13 5 #> 22: 7194 NA 31161.47 31182.97 13 5 #> 23: 7195 31182.97 31182.97 31182.97 14 2 #> time dist dist_before dist_after rle missings # .SD[,.I] will get us the row number relative to the group it is in. # For example, row 5 dist is calculated as # dist_before + 1 * (dist_after - dist_before)/5 df[is.na(dist), dist := dist_before + .SD[,.I] * (dist_after - dist_before)/(missings), by = rle] df[] #> time dist dist_before dist_after rle missings #> 1: 7173 31091.33 31091.33 31091.33 1 2 #> 2: 7174 31095.67 31091.33 31100.00 2 2 #> 3: 7175 31100.00 31100.00 31100.00 3 2 #> 4: 7176 31103.27 31103.27 31103.27 4 2 #> 5: 7177 31107.61 31103.27 31124.98 5 5 #> 6: 7178 31111.95 31103.27 31124.98 5 5 #> 7: 7179 31116.30 31103.27 31124.98 5 5 #> 8: 7180 31120.64 31103.27 31124.98 5 5 #> 9: 7181 31124.98 31124.98 31124.98 6 2 #> 10: 7182 31128.90 31124.98 31132.81 7 2 #> 11: 7183 31132.81 31132.81 31132.81 8 2 #> 12: 7184 31137.09 31132.81 31154.19 9 5 #> 13: 7185 31141.36 31132.81 31154.19 9 5 #> 14: 7186 31145.64 31132.81 31154.19 9 5 #> 15: 7187 31149.91 31132.81 31154.19 9 5 #> 16: 7188 31154.19 31154.19 31154.19 10 2 #> 17: 7189 31157.83 31154.19 31161.47 11 2 #> 18: 7190 31161.47 31161.47 31161.47 12 2 #> 19: 7191 31165.77 31161.47 31182.97 13 5 #> 20: 7192 31170.07 31161.47 31182.97 13 5 #> 21: 7193 31174.37 31161.47 31182.97 13 5 #> 22: 7194 31178.67 31161.47 31182.97 13 5 #> 23: 7195 31182.97 31182.97 31182.97 14 2 #> time dist dist_before dist_after rle missings

Puede usar la función approx para hacer interpolación lineal.

Para cada grupo de NA s, obtenga ese subconjunto de DT más las filas antes y después. Luego aplique approx a este subconjunto del vector dist , con el argumento n de approx igual al número de filas en el subconjunto .N .

DT[, g := rleid(dist)] DT[is.na(dist), dist := { i <- .I[c(1, .N)] + c(-1, 1) DT[i[1]:i[2], approx(dist, n = .N)$y[-c(1, .N)]] }, by = g]

O sin approx

DT[, g := rleid(dist)] DT[is.na(dist), dist := { i <- .I[c(1, .N)] + c(-1, 1) DT[i[1]:i[2], dist[1] + 1:(.N - 2)*(dist[.N] - dist[1])/(.N - 1)] }, by = g]

Usando la library(zoo)

DT[, dist := na.approx(dist)]

Alternativamente, si prefiere apegarse a las funciones base R en lugar de usar otro paquete, puede hacerlo

DT[, dist := approx(.I, dist, .I)$y]