data.table interpolando linealmente valores de NA sin grupos
(5)
Quería completar algunos valores de NA en una tabla de datos sin grupos. Considere este extracto de data.table que representa el tiempo y las distancias:
library(data.table)
df <- data.frame(time = seq(7173, 7195, 1), dist = c(31091.33, NA, 31100.00, 31103.27, NA, NA, NA, NA, 31124.98, NA,31132.81, NA, NA, NA, NA, 31154.19, NA, 31161.47, NA, NA, NA, NA, 31182.97))
DT<- data.table(df)
Quiero en DT data.table, para llenar los valores de NA con una función que depende del valor que no sea NA antes / después. Como ejemplo, escribir una función en j para reemplazar cada instrucción
DT[2, dist := (31091.33 + (31100-31091.33) / 2)]
luego
DT[5:8, dist := (31103.27 + "something" * (31124.98 - 31103.27) / 5)]
etc ...
2 otras opciones:
1) junta rodante:
DT[is.na(dist), dist := {
x0y0 <- DT[!is.na(dist)][.SD, on=.(time), roll=Inf, .(time=x.time, dist=x.dist)]
x1y1 <- DT[!is.na(dist)][.SD, on=.(time), roll=-Inf, .(time=x.time, dist=x.dist)]
(x1y1$dist - x0y0$dist) / (x1y1$time - x0y0$time) * (time - x0y0$time) + x0y0$dist
}]
DT
2) otra variante cercana de respuesta
nafill
usando
nafill
DT[, dist := {
y0 <- nafill(dist, "locf")
x0 <- nafill(replace(time, is.na(dist), NA), "locf")
y1 <- nafill(dist, "nocb")
x1 <- nafill(replace(time, is.na(dist), NA), "nocb")
fifelse(is.na(dist), (y1 - y0) / (x1 - x0) * (time - x0) + y0, dist)
}]
código de tiempo:
library(data.table)
set.seed(0L)
# df=data.frame(time=seq(7173,7195,1),dist=c(31091.33,NA,31100.00,31103.27,NA,NA,NA,NA,31124.98,NA,31132.81,NA,NA,NA,NA,31154.19,NA,31161.47,NA,NA,NA,NA,31182.97))
# DT=data.table(df)
nr <- 1e6
nNA <- nr/2
DT <- data.table(time=1:nr, dist=replace(rnorm(nr), sample(1:nr, nNA), NA_real_))
DT00 <- copy(DT)
DT01 <- copy(DT)
DT1 <- copy(DT)
DT20 <- copy(DT)
DT21 <- copy(DT)
mtd00 <- function() {
DT00[, g := rleid(is.na(dist))]
DT00[is.na(dist), dist := {
i <- .I[c(1, .N)] + c(-1, 1)
DT00[i[1]:i[2], approx(dist, n = .N)$y[-c(1, .N)]]
}, by = g]
}
mtd01 <- function() {
DT01[, g := rleid(is.na(dist))]
DT01[is.na(dist), dist := {
i <- .I[c(1, .N)] + c(-1, 1)
DT01[i[1]:i[2], dist[1] + 1:(.N - 2)*(dist[.N] - dist[1])/(.N - 1)]
}, by = g]
}
mtd1 <- function() {
DT1[,dist_before := nafill(dist, "locf")]
DT1[,dist_after := nafill(dist, "nocb")]
DT1[, rle := rleid(dist)][,missings := max(.N + 1 , 2), by = rle][]
DT1[is.na(dist), dist := dist_before + .SD[,.I] *
(dist_after - dist_before)/(missings), by = rle]
}
mtd20 <- function() {
DT20[is.na(dist), dist := {
x0y0 <- DT[!is.na(dist)][.SD, on=.(time), roll=Inf, .(time=x.time, dist=x.dist)]
x1y1 <- DT[!is.na(dist)][.SD, on=.(time), roll=-Inf, .(time=x.time, dist=x.dist)]
(x1y1$dist - x0y0$dist) / (x1y1$time - x0y0$time) * (time - x0y0$time) + x0y0$dist
}]
}
mtd21 <- function() {
DT21[, dist := {
y0 <- nafill(dist, "locf")
x0 <- nafill(replace(time, is.na(dist), NA), "locf")
y1 <- nafill(dist, "nocb")
x1 <- nafill(replace(time, is.na(dist), NA), "nocb")
fifelse(is.na(dist), (y1 - y0) / (x1 - x0) * (time - x0) + y0, dist)
}]
}
bench::mark(#mtd00(), mtd01(),
mtd1(), mtd20(), mtd21(), check=FALSE)
# all.equal(mtd01()$dist, mtd1()$dist)
#[1] TRUE
# all.equal(mtd20()$dist, mtd1()$dist)
#[1] TRUE
# all.equal(mtd21()$dist, mtd1()$dist)
#[1] TRUE
tiempos:
# A tibble: 3 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
1 mtd1() 834.6ms 834.6ms 1.20 7.91GB 2.40 1 2 834.57ms <df[,6] [1,000,000 x 6]> <df[,3] [1,814,567 x 3]> <bch:tm> <tibble [1 x 3]>
2 mtd20() 40.2ms 42.4ms 19.6 99.53MB 3.92 10 2 509.89ms <df[,2] [1,000,000 x 2]> <df[,3] [301 x 3]> <bch:tm> <tibble [10 x 3]>
3 mtd21() 37.5ms 60.1ms 3.89 76.46MB 1.95 8 4 2.05s <df[,2] [1,000,000 x 2]> <df[,3] [225 x 3]> <bch:tm> <tibble [8 x 3]>
editar: para abordar el comentario sobre el uso de
is.na(dist)
varias veces:
set.seed(0L)
nr <- 1e7
nNA <- nr/2
DT <- data.table(time=1:nr, dist=replace(rnorm(nr), sample(1:nr, nNA), NA_real_))
DT20 <- copy(DT)
DT201 <- copy(DT)
DT202 <- copy(DT)
mtd20 <- function() {
DT20[is.na(dist), dist := {
x0y0 <- DT20[!is.na(dist)][.SD, on=.(time), roll=Inf, .(time=x.time, dist=x.dist)]
x1y1 <- DT20[!is.na(dist)][.SD, on=.(time), roll=-Inf, .(time=x.time, dist=x.dist)]
(x1y1$dist - x0y0$dist) / (x1y1$time - x0y0$time) * (time - x0y0$time) + x0y0$dist
}]
}
mtd201 <- function() {
i <- DT201[, is.na(dist)]
DT201[(i), dist := {
x0y0 <- DT201[(!i)][.SD, on=.(time), roll=Inf, .(time=x.time, dist=x.dist)]
x1y1 <- DT201[(!i)][.SD, on=.(time), roll=-Inf, .(time=x.time, dist=x.dist)]
(x1y1$dist - x0y0$dist) / (x1y1$time - x0y0$time) * (time - x0y0$time) + x0y0$dist
}]
}
mtd202 <- function() {
i <- DT201[is.na(dist), which=TRUE]
DT201[i, dist := {
x0y0 <- DT201[-i][.SD, on=.(time), roll=Inf, .(time=x.time, dist=x.dist)]
x1y1 <- DT201[-i][.SD, on=.(time), roll=-Inf, .(time=x.time, dist=x.dist)]
(x1y1$dist - x0y0$dist) / (x1y1$time - x0y0$time) * (time - x0y0$time) + x0y0$dist
}]
}
tiempos:
# A tibble: 3 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
1 mtd20() 24.1ms 25.8ms 37.5 1.01GB 13.6 11 4 294ms <df[,2] [10,000,000 x 2]> <df[,3] [310 x 3]> <bch:tm> <tibble [15 x 3]>
2 mtd201() 24.8ms 25.6ms 38.2 954.07MB 8.19 14 3 366ms <df[,2] [10,000,000 x 2]> <df[,3] [398 x 3]> <bch:tm> <tibble [17 x 3]>
3 mtd202() 24ms 25.6ms 38.3 76.39MB 8.22 14 3 365ms <df[,2] [10,000,000 x 2]> <df[,3] [241 x 3]> <bch:tm> <tibble [17 x 3]>
No se ven muchas diferencias en los tiempos cuando se reduce la cantidad de
is.na(dist)
Aquí hay un enfoque rcpp que recorre todo una vez con pase adicional para todos los elementos de NA.
#include <Rcpp.h>
using namespace Rcpp;
// Enable C++11 via this plugin (Rcpp 0.10.3 or later)
// [[Rcpp::plugins(cpp11)]]
// [[Rcpp::export]]
NumericVector rcpp_approx(NumericVector y) {
double start = 0;
double slope = 0;
int count = 0;
int first_non_na = 0;
NumericVector x = clone(y); //added to not update-by-reference
//need to handle vectors that start with NA. This will return "NA"
for(int k = 0; k < x.size(); k++) {
if (!NumericVector::is_na(x[k])) break;
first_non_na = k + 1;
}
//iterator seems similar to the idea of *apply with the added benefit
// that we can reference relative to the element being acted on.
for(NumericVector::iterator i = std::next(x.begin(), first_non_na); i != x.end(); ++i){
if (NumericVector::is_na(*i)){
count++;
} else {
if (count != 0) {
start = *(i-(count+1));
slope = (*i - start) / (count + 1);
for (int j = 0; j < count; j++){
*(i-(count-j)) = start + slope * (j + 1);
}
count = 0;
}
}
}
return(x);
}
Luego en R:
DT[, rcpp_approx(dist)]
El código se explica en línea.
Puede eliminar las columnas temporales usando
df[,dist_before := NULL]
, por ejemplo.
library(data.table)
df=data.table(time=seq(7173,7195,1),dist=c(31091.33,NA,31100.00,31103.27,NA,NA,NA,
NA,31124.98,NA,31132.81,NA,NA,NA,NA,31154.19,NA,31161.47,NA,NA,NA,NA,31182.97))
df
#> time dist
#> 1: 7173 31091.33
#> 2: 7174 NA
#> 3: 7175 31100.00
#> 4: 7176 31103.27
#> 5: 7177 NA
#> 6: 7178 NA
#> 7: 7179 NA
#> 8: 7180 NA
#> 9: 7181 31124.98
#> 10: 7182 NA
#> 11: 7183 31132.81
#> 12: 7184 NA
#> 13: 7185 NA
#> 14: 7186 NA
#> 15: 7187 NA
#> 16: 7188 31154.19
#> 17: 7189 NA
#> 18: 7190 31161.47
#> 19: 7191 NA
#> 20: 7192 NA
#> 21: 7193 NA
#> 22: 7194 NA
#> 23: 7195 31182.97
#> time dist
# Carry forward the last non-missing observation
df[,dist_before := nafill(dist, "locf")]
# Bring back the next non-missing dist
df[,dist_after := nafill(dist, "nocb")]
# rleid will create groups based on run-lengths of values within the data.
# This means 4 NA''s in a row will be grouped together, for example.
# We then count the missings and add 1, because we want the
# last NA before the next non-missing to be less than the non-missing value.
df[, rle := rleid(dist)][,missings := max(.N + 1 , 2), by = rle][]
#> time dist dist_before dist_after rle missings
#> 1: 7173 31091.33 31091.33 31091.33 1 2
#> 2: 7174 NA 31091.33 31100.00 2 2
#> 3: 7175 31100.00 31100.00 31100.00 3 2
#> 4: 7176 31103.27 31103.27 31103.27 4 2
#> 5: 7177 NA 31103.27 31124.98 5 5
#> 6: 7178 NA 31103.27 31124.98 5 5
#> 7: 7179 NA 31103.27 31124.98 5 5
#> 8: 7180 NA 31103.27 31124.98 5 5
#> 9: 7181 31124.98 31124.98 31124.98 6 2
#> 10: 7182 NA 31124.98 31132.81 7 2
#> 11: 7183 31132.81 31132.81 31132.81 8 2
#> 12: 7184 NA 31132.81 31154.19 9 5
#> 13: 7185 NA 31132.81 31154.19 9 5
#> 14: 7186 NA 31132.81 31154.19 9 5
#> 15: 7187 NA 31132.81 31154.19 9 5
#> 16: 7188 31154.19 31154.19 31154.19 10 2
#> 17: 7189 NA 31154.19 31161.47 11 2
#> 18: 7190 31161.47 31161.47 31161.47 12 2
#> 19: 7191 NA 31161.47 31182.97 13 5
#> 20: 7192 NA 31161.47 31182.97 13 5
#> 21: 7193 NA 31161.47 31182.97 13 5
#> 22: 7194 NA 31161.47 31182.97 13 5
#> 23: 7195 31182.97 31182.97 31182.97 14 2
#> time dist dist_before dist_after rle missings
# .SD[,.I] will get us the row number relative to the group it is in.
# For example, row 5 dist is calculated as
# dist_before + 1 * (dist_after - dist_before)/5
df[is.na(dist), dist := dist_before + .SD[,.I] *
(dist_after - dist_before)/(missings), by = rle]
df[]
#> time dist dist_before dist_after rle missings
#> 1: 7173 31091.33 31091.33 31091.33 1 2
#> 2: 7174 31095.67 31091.33 31100.00 2 2
#> 3: 7175 31100.00 31100.00 31100.00 3 2
#> 4: 7176 31103.27 31103.27 31103.27 4 2
#> 5: 7177 31107.61 31103.27 31124.98 5 5
#> 6: 7178 31111.95 31103.27 31124.98 5 5
#> 7: 7179 31116.30 31103.27 31124.98 5 5
#> 8: 7180 31120.64 31103.27 31124.98 5 5
#> 9: 7181 31124.98 31124.98 31124.98 6 2
#> 10: 7182 31128.90 31124.98 31132.81 7 2
#> 11: 7183 31132.81 31132.81 31132.81 8 2
#> 12: 7184 31137.09 31132.81 31154.19 9 5
#> 13: 7185 31141.36 31132.81 31154.19 9 5
#> 14: 7186 31145.64 31132.81 31154.19 9 5
#> 15: 7187 31149.91 31132.81 31154.19 9 5
#> 16: 7188 31154.19 31154.19 31154.19 10 2
#> 17: 7189 31157.83 31154.19 31161.47 11 2
#> 18: 7190 31161.47 31161.47 31161.47 12 2
#> 19: 7191 31165.77 31161.47 31182.97 13 5
#> 20: 7192 31170.07 31161.47 31182.97 13 5
#> 21: 7193 31174.37 31161.47 31182.97 13 5
#> 22: 7194 31178.67 31161.47 31182.97 13 5
#> 23: 7195 31182.97 31182.97 31182.97 14 2
#> time dist dist_before dist_after rle missings
Puede usar la función
approx
para hacer interpolación lineal.
Para cada grupo de
NA
s, obtenga ese subconjunto de
DT
más las filas antes y después.
Luego aplique
approx
a este subconjunto del vector
dist
, con el argumento
n
de
approx
igual al número de filas en el subconjunto
.N
.
DT[, g := rleid(dist)]
DT[is.na(dist), dist := {
i <- .I[c(1, .N)] + c(-1, 1)
DT[i[1]:i[2], approx(dist, n = .N)$y[-c(1, .N)]]
}, by = g]
O sin
approx
DT[, g := rleid(dist)]
DT[is.na(dist), dist := {
i <- .I[c(1, .N)] + c(-1, 1)
DT[i[1]:i[2], dist[1] + 1:(.N - 2)*(dist[.N] - dist[1])/(.N - 1)]
}, by = g]
Usando la
library(zoo)
DT[, dist := na.approx(dist)]
Alternativamente, si prefiere apegarse a las funciones base R en lugar de usar otro paquete, puede hacerlo
DT[, dist := approx(.I, dist, .I)$y]