问题描述:

I'm analyzing Dual-Polarization radar data and I want to add the result of the Marshall Palmer relation as an ID-level variable in my data.

There's no CRAN function for this that I can find, but another R user has script wherein he applies the relation as an estimate of an expected value in the data:

# From Troy W. (thanks!)

# A few small changes by hack-r

## Someone better in R than me could probably clean up/refactor the code a bit.

library(dplyr)

library(data.table)

test <- fread('../input/test.csv')

mpalmer <- function(ref, minutes_past) {

# order reflectivity values and minutes_past

sort_min_index = order(minutes_past)

minutes_past <- minutes_past[sort_min_index]

ref <- ref[sort_min_index]

# calculate the length of time for which each reflectivity value is valid

valid_time <- rep(0, length(minutes_past))

valid_time[1] <- minutes_past[1]

if (length(valid_time) > 1) {

for (i in seq(2, length(minutes_past))) {

valid_time[i] <- minutes_past[i] - minutes_past[i-1]

}

valid_time[length(valid_time)] = valid_time[length(valid_time)] + 60 - sum(valid_time)

} else {

# if only 1 observation, make it valid for the entire hour

valid_time <- 60

}

valid_time = valid_time / 60

# calculate hourly rain rates using marshall-palmer weighted by valid times

sum <- 0

for (i in seq(length(ref))) {

if (!is.na(ref[i])) {

mmperhr <- ((10^(ref[i]/10))/200) ^ 0.625

sum <- sum + mmperhr * valid_time[i]

}

}

return(sum)

}

results <- test %>% group_by(Id) %>% summarize(Expected=sum)

write.csv(results, file='sample_solution.csv', row.names=FALSE)

In addition to being incredibly slow with Big Data, the problem with the code above is that it doesn't create a column of results within the original data, which would allow it to be productionalized in an ETL pipeline that created this relation at the ID level as 1 predictive variable in the dataset.

I tried rewriting the function like this:

mpalmer <- function(ref, minutes_past) {

# Credit to Troy for this

# edits by Jason Miller, hack-r.com

# order reflectivity values and minutes_past

sort_min_index = order(minutes_past)

minutes_past <- minutes_past[sort_min_index]

ref <- ref[sort_min_index]

# calculate the length of time for which each reflectivity value is valid

valid_time <- rep(0, length(minutes_past))

valid_time[1] <- minutes_past[1]

if (length(valid_time) > 1) {

for (i in seq(2, length(minutes_past))) {

valid_time[i] <- minutes_past[i] - minutes_past[i-1]

}

valid_time[length(valid_time)] = valid_time[length(valid_time)] + 60 - sum(valid_time)

} else {

# if only 1 observation, make it valid for the entire hour

valid_time <- 60

}

valid_time = valid_time / 60

# calculate hourly rain rates using marshall-palmer weighted by valid times

sum <- 0

for (i in seq(length(ref))) {

if (!is.na(ref[i])) {

mmperhr <- ((10^(ref[i]/10))/200) ^ 0.625

sum <- sum + mmperhr * valid_time[i]

}

}

return(sum)

}

and then applying it like this:

train.samp$mp <- aggregate(train.samp$Ref, by=list(train.samp$Id), FUN = mpalmer, minutes_past = train.samp$minutes_past)

which I think mostly works, however after running for a long time, it returned an error like this:

Error in `$<-.data.frame`(`*tmp*`, "mp", value = list(Group.1 = c(10L, :

replacement has 9765 rows, data has 10000

I've tried it on different samples of the data and the error message is always in that format, though the specific numbers may change. There's no missing data in the dataset.

Any idea how to fix this function (and/or make it faster)?

Update: I've got it working with a for loop but it is SO slow...

网友答案:

This is what I'm going with for now, but I'm still open to other solutions.

Basically, I went back to the original function then broke apart the overly large dataset into manageable chunks and ran for loops on each chunk:

  train.samp      <- train.samp[order(train.samp$Id),]
  train.samp1     <- train.samp1[order(train.samp1$Id),]

  train.samp.id    <- unique(train.samp$Id)
  train.samp.id.1  <- train.samp.id[1:25000]
  train.samp.id.2  <- train.samp.id[25001:50000]
  train.samp.id.3  <- train.samp.id[50001:75000]
  train.samp.id.4  <- train.samp.id[75001:100000]
  train.samp.id.6  <- train.samp.id[100001:125000]
  train.samp.id.5  <- train.samp.id[125001:150000]
  train.samp.id.7  <- train.samp.id[150001:175000]
  train.samp.id.8  <- train.samp.id[175001:200000]
  train.samp.id.9  <- train.samp.id[200001:length(train.samp.id)]

  train.samp.1 <- train.samp[train.samp$Id %in% train.samp.id.1,]
  train.samp.2 <- train.samp[train.samp$Id %in% train.samp.id.2,]
  train.samp.3 <- train.samp[train.samp$Id %in% train.samp.id.3,]
  train.samp.4 <- train.samp[train.samp$Id %in% train.samp.id.4,]
  train.samp.5 <- train.samp[train.samp$Id %in% train.samp.id.5,]
  train.samp.6 <- train.samp[train.samp$Id %in% train.samp.id.6,]
  train.samp.7 <- train.samp[train.samp$Id %in% train.samp.id.7,]
  train.samp.8 <- train.samp[train.samp$Id %in% train.samp.id.8,]
  train.samp.9 <- train.samp[train.samp$Id %in% train.samp.id.9,]

  system.time(
  for(i in unique(train.samp.1$Id)){
    train.samp.1$mp[train.samp.1$Id == i] <- mpalmer(train.samp.1$Ref[train.samp.1$Id == i], minutes_past = train.samp.1$minutes_past[train.samp.1$Id == i])
  }    )
  for(i in unique(train.samp$Id[train.samp$Id %in% train.samp.id.2])){
    train.samp$mp[train.samp$Id == i] <- mpalmer(train.samp$Ref[train.samp$Id == i], minutes_past = train.samp$minutes_past[train.samp$Id == i])
  }    
  for(i in unique(train.samp$Id[train.samp$Id %in% train.samp.id.3])){
    train.samp$mp[train.samp$Id == i] <- mpalmer(train.samp$Ref[train.samp$Id == i], minutes_past = train.samp$minutes_past[train.samp$Id == i])
  }    
  for(i in unique(train.samp$Id[train.samp$Id %in% train.samp.id.4])){
    train.samp$mp[train.samp$Id == i] <- mpalmer(train.samp$Ref[train.samp$Id == i], minutes_past = train.samp$minutes_past[train.samp$Id == i])
  }    
  for(i in unique(train.samp$Id[train.samp$Id %in% train.samp.id.5])){
    train.samp$mp[train.samp$Id == i] <- mpalmer(train.samp$Ref[train.samp$Id == i], minutes_past = train.samp$minutes_past[train.samp$Id == i])
  }    
  for(i in unique(train.samp$Id[train.samp$Id %in% train.samp.id.6])){
    train.samp$mp[train.samp$Id == i] <- mpalmer(train.samp$Ref[train.samp$Id == i], minutes_past = train.samp$minutes_past[train.samp$Id == i])
  }    
  for(i in unique(train.samp$Id[train.samp$Id %in% train.samp.id.7])){
    train.samp$mp[train.samp$Id == i] <- mpalmer(train.samp$Ref[train.samp$Id == i], minutes_past = train.samp$minutes_past[train.samp$Id == i])
  }    
  for(i in unique(train.samp$Id[train.samp$Id %in% train.samp.id.8])){
    train.samp$mp[train.samp$Id == i] <- mpalmer(train.samp$Ref[train.samp$Id == i], minutes_past = train.samp$minutes_past[train.samp$Id == i])
  }    
  for(i in unique(train.samp$Id[train.samp$Id %in% train.samp.id.9])){
    train.samp$mp[train.samp$Id == i] <- mpalmer(train.samp$Ref[train.samp$Id == i], minutes_past = train.samp$minutes_past[train.samp$Id == i])
  }    

system.time(  
  for(i in unique(train.samp1$Id)){
    train.samp1$mp[train.samp1$Id == i] <- mpalmer(train.samp1$Ref[train.samp1$Id == i], minutes_past = train.samp1$minutes_past[train.samp1$Id == i])
  }   

The function is not shown here, but I am about to take advantage of @Gregor's suggestion in the comment.

相关阅读:
Top