如何找到日期和时间的重叠?

4

我有一个包含产品和区间的数据框:

df <- data.frame(Product=c("x","y","z", "x"), 
      Start_Date=c("1/1/2015 08:00", "3/1/2015 10:00", "4/1/2015 09:00", "4/1/2015 12:34"), 
      End_Date=c("2/1/2015 09:00","5/1/2015 12:00","5/1/2015 13:00", "7/1/2015 11:23"))

  Product     Start_Date       End_Date
1       x 1/1/2015 08:00 2/1/2015 09:00
2       y 3/1/2015 10:00 5/1/2015 12:00
3       z 4/1/2015 09:00 5/1/2015 13:00
4       x 4/1/2015 12:34 7/1/2015 11:23

我希望找出产品的日期和时间之间的重叠部分,将它们拆分成互斥集合,并记录每个区间发生的产品。

最终结果应类似于这样:

df2 <- data.frame(Product=c("x","y", "y/z", "y/z/x", "z/x", "x"), 
     Start_Date=c("1/1/2015 08:00", "3/1/2015 10:00", "4/1/2015 09:00", "4/1/2015 12:34","5/1/2015 
                  12:00", "5/1/2015 13:00"), 
    End_Date=c("2/1/2015 09:00", "4/1/2015 09:00", "4/1/2015 12:34", "5/1/2015 12:00", "5/1/2015 
                  13:00", "7/1/2015 11:23"))

  Product     Start_Date       End_Date
1       x 1/1/2015 08:00 2/1/2015 09:00
2       y 3/1/2015 10:00 4/1/2015 09:00
3     y/z 4/1/2015 09:00 4/1/2015 12:34
4   y/z/x 4/1/2015 12:34 5/1/2015 12:00
5     z/x 5/1/2015 12:00 5/1/2015 13:00
6       x 5/1/2015 13:00 7/1/2015 11:23
4个回答

3

我想象这个问题比你表面上描述的要具有挑战性得多。这些区间重叠问题可以使用IRanges包很好地处理。

首先,让我们将日期转换为整数以进行快速处理:

df[,-1] <- lapply(df[,-1],function(x) as.integer(as.POSIXct(x,"%m/%d/%Y %H:%M",tz = "America/New_York")))
df
  Product Start_Date   End_Date
1       x 1420099200 1422781200
2       y 1425204000 1430481600
3       z 1427878800 1430485200
4       x 1427891640 1435749780

现在我们可以转换为IRanges:
library(IRanges)
df.ranges <- IRanges(df$Start_Date,end = df$End_Date,names = df$Product)
df.ranges
IRanges object with 4 ranges and 0 metadata columns:
         start        end     width
     <integer>  <integer> <integer>
  x 1420099200 1422781200   2682001
  y 1425204000 1430481600   5277601
  z 1427878800 1430485200   2606401
  x 1427891640 1435749780   7858141

现在我们可以使用disjoin将范围分割成一组重叠的区间:

df.disjoin <- disjoin(df.ranges)
df.disjoin
IRanges object with 6 ranges and 0 metadata columns:
           start        end     width
       <integer>  <integer> <integer>
  [1] 1420099200 1422781200   2682001
  [2] 1425204000 1427878799   2674800
  [3] 1427878800 1427891639     12840
  [4] 1427891640 1430481600   2589961
  [5] 1430481601 1430485200      3600
  [6] 1430485201 1435749780   5264580

现在我们可以找到分离的区间和原始区间之间的重叠部分:

hits <- findOverlaps(df.disjoin,df.ranges)
hits
Hits object with 10 hits and 0 metadata columns:
       queryHits subjectHits
       <integer>   <integer>
   [1]         1           1
   [2]         2           2
   [3]         3           2
   [4]         3           3
   [5]         4           2
   [6]         4           3
   [7]         4           4
   [8]         5           3
   [9]         5           4
  [10]         6           4
  -------
  queryLength: 6 / subjectLength: 4

从这里我们可以获得期望的结果:

library(tidyverse)
as_tibble(hits) %>%
   mutate(Product = names(df.ranges)[subjectHits]) %>%
   group_by(queryHits) %>%
   summarise(Product = paste(Product,collapse = "/")) -> result
result
# A tibble: 6 x 2
  queryHits Product
      <int> <chr>  
1         1 x      
2         2 y      
3         3 y/z    
4         4 y/z/x  
5         5 z/x    
6         6 x   

现在我们可以将结果绑定在一起,并格式化为可识别的日期:
bind_cols(as.data.frame(df.disjoin),result) %>%
  mutate(across(start:end, ~as.POSIXlt(.,origin = "1970-01-01 00:00.00 UTC")))
                start                 end   width queryHits Product
1 2015-01-01 08:00:00 2015-02-01 09:00:00 2682001         1       x
2 2015-03-01 10:00:00 2015-04-01 08:59:59 2671200         2       y
3 2015-04-01 09:00:00 2015-04-01 12:33:59   12840         3     y/z
4 2015-04-01 12:34:00 2015-05-01 12:00:00 2589961         4   y/z/x
5 2015-05-01 12:00:01 2015-05-01 13:00:00    3600         5     z/x
6 2015-05-01 13:00:01 2015-07-01 11:23:00 5264580         6       x

这种方法在内存可以容纳的几乎所有区间中都可以非常快速地处理。


1
很棒的回答!从你这里学到了 IRanges。点赞! - ThomasIsCoding

2

这里提供了一种不使用任何额外软件包的方法:

首先,我们合并所有起始日期和结束日期,然后按升序排列它们。这样,我们得到了一个包含所有时间段的表格。

df$Start_Date = strptime(as.character(df$Start_Date),format = "%d/%m/%Y %H:%M")
df$End_Date = strptime(as.character(df$End_Date),format = "%d/%m/%Y %H:%M")

dates=sort(unique(c(df[,2],df[,3])))

df2=data.frame(start=dates[1:length(dates)-1],end=dates[2:length(dates)])

对于每个时间段,我们检查在给定时间段内哪个产品处于活动状态。如果没有这样的产品,则返回“NA”。

active<-function(start,end){
  tmp<-as.vector(df[df$Start_Date<=start & df$End_Date>= end,1])
  if(length(tmp)>0){
    paste(tmp, collapse="/")
  }
  else{
    return(NA)
  }
}

activeproducts <- mapply(active,df2[,1],df2[,2])
df2<-transform(df2,products=activeproducts)

为了去除没有活动产品的时间段,我们使用 complete.cases 函数。
df2<- df2[complete.cases(df2),]

>df2

                start                 end products
1 2015-01-01 08:00:00 2015-01-02 09:00:00        x
3 2015-01-03 10:00:00 2015-01-04 09:00:00        y
4 2015-01-04 09:00:00 2015-01-04 12:34:00      y/z
5 2015-01-04 12:34:00 2015-01-05 12:00:00    y/z/x
6 2015-01-05 12:00:00 2015-01-05 13:00:00      z/x
7 2015-01-05 13:00:00 2015-01-07 11:23:00        x

1

这里有一种使用data.table包的选项。

datecols <- c("Start_Date", "End_Date")
setDT(df)[,
  (datecols) := lapply(.SD, as.POSIXct, format = "%d/%m/%Y %H:%M"),
  .SDcols = datecols
]
d <- sort(unlist(df[, datecols, with = FALSE]))
res <- df[data.table(s = first(d, -1), e = last(d, -1)),
  on = .(Start_Date <= s, End_Date >= e)
][
  !is.na(Product),
  .(Product = paste0(Product, collapse = "/")),
  datecols
][
  ,
  (datecols) := lapply(.SD, as.POSIXct, origin = "1970-01-01"),
  .SDcols = datecols
][]

使得
> res
            Start_Date            End_Date Product
1: 2015-01-01 08:00:00 2015-01-02 09:00:00       x
2: 2015-01-03 10:00:00 2015-01-04 09:00:00       y
3: 2015-01-04 09:00:00 2015-01-04 12:34:00     y/z
4: 2015-01-04 12:34:00 2015-01-05 12:00:00   y/z/x
5: 2015-01-05 12:00:00 2015-01-05 13:00:00     z/x
6: 2015-01-05 13:00:00 2015-01-07 11:23:00       x

IRanges 的开销相当大。除非有很多间隔,否则您的方法可能更有效。 - Ian Campbell

0
尝试使用 `lubridate` 包:
library(lubridate)
library(purrr)
library(dplyr)

df <- df %>% mutate(Start_Date=strptime(Start_Date,"%d/%m/%Y %H:%M"),
                    End_Date=strptime(End_Date,"%d/%m/%Y %H:%M"))

dates <- c(df$Start_Date,df$End_Date) %>% unique(.)
dates <- dates[order(dates)]

ints <- map2(as.list(dates[-length(dates)]),as.list(dates[-1]),~interval(.x,.y))

i <- map2(df$Start_Date+1,df$End_Date-1,~interval(.x,.y))

p2 <- map_chr(ints,~paste(df$Product[map_lgl(i,function(x)int_overlaps(.,x))],collapse="/"))

df2 <- tibble(
  Product = p2,
  Start_Date = do.call(c,map(ints,int_start)),
  End_Date = do.call(c,map(ints,int_end))
)

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接