背景:
我有一串连续的时间戳数据。
这个数据序列中有一些缺口,有些很大,有些只是一个缺失的值。
每当出现仅有一个缺失值的缺口时,我希望用一个虚拟值来填补这个缺口(将忽略更大的缺口)。
我想要使用懒生成的方法来生成修补后的序列,因此我使用了Seq.unfold。
我制作了两个版本的修补数据方法。
第一个消耗具有间隙数据的序列并生成修补后的序列。 这正是我想要的,但是当输入序列中的元素数量上升到1000以上时,该方法运行得非常缓慢,并且随着输入序列包含的元素越来越多,情况会变得越来越糟糕。
第二个方法消耗一个带缺口数据的列表并生成修补后的序列,它运行速度很快。但这不是我想要的,因为这会强制在内存中实例化整个输入列表。
我希望使用(序列-> 序列)方法而不是(list -> 序列)方法,以避免同时将整个输入列表存储在内存中。
问题:
1) 为什么第一个方法如此缓慢(随着更大的输入列表而逐渐变得越来越糟) (我怀疑这与不断使用Seq.skip 1来创建新序列有关,但我不确定)
2) 如何在使用输入序列而不是输入列表的情况下快速修补数据中的缺口?
代码:
open System
// Method 1 (Slow)
let insertDummyValuesWhereASingleValueIsMissing1 (timeBetweenContiguousValues : TimeSpan) (values : seq<(DateTime * float)>) =
let sizeOfHolesToPatch = timeBetweenContiguousValues.Add timeBetweenContiguousValues // Only insert dummy-values when the gap is twice the normal
(None, values) |> Seq.unfold (fun (prevValue, restOfValues) ->
if restOfValues |> Seq.isEmpty then
None // Reached the end of the input seq
else
let currentValue = Seq.hd restOfValues
if prevValue.IsNone then
Some(currentValue, (Some(currentValue), Seq.skip 1 restOfValues )) // Only happens to the first item in the seq
else
let currentTime = fst currentValue
let prevTime = fst prevValue.Value
let timeDiffBetweenPrevAndCurrentValue = currentTime.Subtract(prevTime)
if timeDiffBetweenPrevAndCurrentValue = sizeOfHolesToPatch then
let dummyValue = (prevTime.Add timeBetweenContiguousValues, 42.0) // 42 is chosen here for obvious reasons, making this comment superfluous
Some(dummyValue, (Some(dummyValue), restOfValues))
else
Some(currentValue, (Some(currentValue), Seq.skip 1 restOfValues))) // Either the two values were contiguous, or the gap between them was too large to patch
// Method 2 (Fast)
let insertDummyValuesWhereASingleValueIsMissing2 (timeBetweenContiguousValues : TimeSpan) (values : (DateTime * float) list) =
let sizeOfHolesToPatch = timeBetweenContiguousValues.Add timeBetweenContiguousValues // Only insert dummy-values when the gap is twice the normal
(None, values) |> Seq.unfold (fun (prevValue, restOfValues) ->
match restOfValues with
| [] -> None // Reached the end of the input list
| currentValue::restOfValues ->
if prevValue.IsNone then
Some(currentValue, (Some(currentValue), restOfValues )) // Only happens to the first item in the list
else
let currentTime = fst currentValue
let prevTime = fst prevValue.Value
let timeDiffBetweenPrevAndCurrentValue = currentTime.Subtract(prevTime)
if timeDiffBetweenPrevAndCurrentValue = sizeOfHolesToPatch then
let dummyValue = (prevTime.Add timeBetweenContiguousValues, 42.0)
Some(dummyValue, (Some(dummyValue), currentValue::restOfValues))
else
Some(currentValue, (Some(currentValue), restOfValues))) // Either the two values were contiguous, or the gap between them was too large to patch
// Test data
let numbers = {1.0..10000.0}
let contiguousTimeStamps = seq { for n in numbers -> DateTime.Now.AddMinutes(n)}
let dataWithOccationalHoles = Seq.zip contiguousTimeStamps numbers |> Seq.filter (fun (dateTime, num) -> num % 77.0 <> 0.0) // Has a gap in the data every 77 items
let timeBetweenContiguousValues = (new TimeSpan(0,1,0))
// The fast sequence-patching (method 2)
dataWithOccationalHoles |> List.of_seq |> insertDummyValuesWhereASingleValueIsMissing2 timeBetweenContiguousValues |> Seq.iter (fun pair -> printfn "%f %s" (snd pair) ((fst pair).ToString()))
// The SLOOOOOOW sequence-patching (method 1)
dataWithOccationalHoles |> insertDummyValuesWhereASingleValueIsMissing1 timeBetweenContiguousValues |> Seq.iter (fun pair -> printfn "%f %s" (snd pair) ((fst pair).ToString()))