我阅读了类似的问题,通过使用窗口函数可以使其起作用,但是因为ClickHouse似乎不支持它们,所以我正在寻找替代解决方案。
给定时间间隔,如(1,5),(2,3),(3,8),(10,15),我想要将重叠的时间区间"合并"成单个区间。在我的例子中,它将是:
(1,8)和(10,15)。
非常感谢您提供任何指导!谢谢!
我阅读了类似的问题,通过使用窗口函数可以使其起作用,但是因为ClickHouse似乎不支持它们,所以我正在寻找替代解决方案。
给定时间间隔,如(1,5),(2,3),(3,8),(10,15),我想要将重叠的时间区间"合并"成单个区间。在我的例子中,它将是:
(1,8)和(10,15)。
非常感谢您提供任何指导!谢谢!
[(1, 5), (2, 3), (3, 8), (6, 9), (11, 15)]
它将返回你
[(1, 8), (3, 9), (11, 15)]
第二个是正确的,但需要将所有时间间隔转换为点列表。在我的情况下,我有一个日期时间间隔,因此我必须创建非常大的数组,其中包含间隔中每秒的值。这会很耗费资源。
这就是为什么我尝试寻找解决方案,并希望我已经找到了。
SELECT [ (1,5), (3, 7), (5, 10), (6,7), (7,8), (9, 12), (15, 20), (16, 22) ] AS sortedIntervals,
-- collect a new array with finishes of all intervals
arrayMap(i -> i.2, sortedIntervals) as finishes,
-- create an array with cumulative finishes (merge small internal intervals and increase finish)
arrayMap( (i, y) -> arrayReduce('max', arraySlice(finishes, 1, y)), finishes, arrayEnumerate(finishes)) AS cumulative_finish,
-- get sequence number of intervals, that will generate a new intervals (which will not be merged)
arrayFilter ( i -> i > 0 ,
arrayMap( (i, y) ->
-- there is no check for element existing, because array[0] will return 0
-- if start of current interval is bigger then cumulative finish of previous one - it's a new interval
( i.1 > cumulative_finish[y-1] )
? y
: 0, sortedIntervals, arrayEnumerate(sortedIntervals))
) AS new_interval_indexes,
-- get result's intervals.
-- the first start - it's our initial start, finish - it's a value from cumulative finish of previous for new start interval
arrayMap( (i, n ) ->
-- there is no checking on existing element, because array[<unexisting_element>] return 0, array[-1] return last element, that we want
( sortedIntervals[i].1, cumulative_finish[new_interval_indexes[n+1]-1] ) ,
new_interval_indexes, arrayEnumerate(new_interval_indexes)
) as result
它将返回
[(1, 12), (15, 22)]
SELECT
intervals,
arraySort(x -> x, intervals) sortedIntervals,
/* try to merge each interval with precede ones */
arrayMap((x, index) -> index != 1
? (arrayReduce(
'min',
arrayMap(
i -> sortedIntervals[i + 1].1,
/* get indexes of intervals that can be merged with the current one (index is zero-based) */
arrayFilter(
i -> x.1 <= sortedIntervals[i + 1].2 AND x.2 >= sortedIntervals[i + 1].1,
range(index)))),
arrayReduce(
'max',
arrayMap(
i -> sortedIntervals[i + 1].2,
/* get indexes of intervals that can be merged with the current one (index is zero-based) */
arrayFilter(
i -> x.1 <= sortedIntervals[i + 1].2 AND x.2 >= sortedIntervals[i + 1].1,
range(index)))))
: x,
sortedIntervals,
arrayEnumerate(sortedIntervals)) rawResult,
/* filter out intervals nested to other ones */
arrayFilter(
(x, index) -> index == length(rawResult) OR x.1 != rawResult[index + 1].1,
rawResult,
arrayEnumerate(rawResult)) result
FROM
(
SELECT [(1, 5), (2, 3), (3, 8), (10, 15)] intervals
UNION ALL
SELECT [(2, 4), (1, 3), (3, 6), (12, 14), (7, 7), (13, 16), (9, 9), (8, 9), (10, 15)]
UNION ALL
SELECT [(20, 22), (18, 18), (16, 21), (1, 8), (2, 9), (3, 5), (10, 12), (11, 13), (14, 15)]
UNION ALL
SELECT []
UNION ALL
SELECT [(1, 11)]
)
FORMAT Vertical;
/*
Row 1:
──────
intervals: [(2,4),(1,3),(3,6),(12,14),(7,7),(13,16),(9,9),(8,9),(10,15)]
sortedIntervals: [(1,3),(2,4),(3,6),(7,7),(8,9),(9,9),(10,15),(12,14),(13,16)]
rawResult: [(1,3),(1,4),(1,6),(7,7),(8,9),(8,9),(10,15),(10,15),(10,16)]
result: [(1,6),(7,7),(8,9),(10,16)]
Row 2:
──────
intervals: [(1,5),(2,3),(3,8),(10,15)]
sortedIntervals: [(1,5),(2,3),(3,8),(10,15)]
rawResult: [(1,5),(1,5),(1,8),(10,15)]
result: [(1,8),(10,15)]
Row 3:
──────
intervals: [(20,22),(18,18),(16,21),(1,8),(2,9),(3,5),(10,12),(11,13),(14,15)]
sortedIntervals: [(1,8),(2,9),(3,5),(10,12),(11,13),(14,15),(16,21),(18,18),(20,22)]
rawResult: [(1,8),(1,9),(1,9),(10,12),(10,13),(14,15),(16,21),(16,21),(16,22)]
result: [(1,9),(10,13),(14,15),(16,22)]
Row 4:
──────
intervals: []
sortedIntervals: []
rawResult: []
result: []
Row 5:
──────
intervals: [(1,11)]
sortedIntervals: [(1,11)]
rawResult: [(1,11)]
result: [(1,11)]
*/
另一种测试解决方案:
select
id,
data.1 as period_group,
min(data.2) as start,
max(data.3) as end,
end - start as diff
from
( select
id,
groupArray((start, end)) as arr,
arraySort(
y -> y.1,
arrayFilter(
x ->
not(arrayExists(
a ->
a.1 <= x.1
and a.2 > x.2
or a.1 < x.1
and a.2 >= x.2,
arr)),
arr)) as arr_clear,
arrayMap(
x, y -> (x, y),
arr_clear,
arrayPushFront(
arrayPopBack(arr_clear),
arr_clear[1])) as arr_w_prev,
arrayMap(
x, y -> (y, x.1, x.2),
arr_clear,
arrayCumSum(
x -> x.1.1 > x.2.2,
arr_w_prev)) as arr_groups
from
values (
'id String, start UInt8, end UInt8',
('a', 1, 10),
('a', 2, 8),
('a', 7, 15),
('a', 20, 25),
('b', 116, 130),
('b', 111, 114),
('b', 107, 109),
('b', 103, 105),
('b', 100, 120),
('c', 50, 60),
('c', 50, 70),
('d', 1, 5),
('d', 2, 3),
('d', 3, 8),
('d', 6, 9),
('d', 11, 15),
('e', 1, 5),
('e', 3, 7),
('e', 5, 10),
('e', 6, 7),
('e', 7, 8),
('e', 9, 12),
('e', 15, 20),
('e', 16, 22),
('f', 1, 8),
('f', 2, 9),
('f', 3, 5),
('f', 10, 12),
('f', 11, 13),
('f', 14, 15),
('f', 16, 21),
('f', 18, 18),
('f', 20, 21)
)
group by
id) as t
array join arr_groups as data
group by
id,
period_group
order by
id,
period_group;
SELECT arraySplit((x, y) -> y, sortedDisArray, arrayMap(x -> if(x <= 1, 0, 1), arrayDifference(sortedDisArray))) AS z,
arrayMap(el -> (arrayReduce('min', el), arrayReduce('max', el)), z) as result
FROM
(
SELECT [(1, 5), (2, 3), (3, 8), (10, 15)] AS a,
arrayMap(x -> x.2 -x.1 ,a) as durations,
arrayMap((i, y) ->
arrayMap(n -> n + a[y].1 ,
range(toUInt64(durations[y]+1))) , durations, arrayEnumerate(a)) as disArray,
arraySort(arrayDistinct(flatten(disArray))) as sortedDisArray
)