#!/usr/bin/env python3
import time
import numpy
# Two dates (written in _DATE_FORMAT) delimiting the interval; main() centres
# the distribution on the midpoint between them.
_DATE_RANGE = ('1925-01-01', '1992-01-01')
# strptime/strftime pattern used to parse _DATE_RANGE and to format the output.
_DATE_FORMAT = '%Y-%m-%d'
# Standard deviation of the distribution, as a fraction of the interval width.
_EMPIRICAL_SCALE_RATIO = 0.15
# Number of random samples main() draws.
_DISTRIBUTION_SIZE = 1000
def main():
    """Print a sorted tuple of normally distributed random date strings.

    The two dates in ``_DATE_RANGE`` are parsed with ``_DATE_FORMAT`` and
    converted to local-time timestamps.  Samples are drawn from a normal
    distribution centred on the midpoint of that interval, with a standard
    deviation of ``_EMPIRICAL_SCALE_RATIO`` times the interval width.  The
    samples are not clipped, so some may fall outside the interval.
    """
    bounds = [
        time.mktime(time.strptime(text, _DATE_FORMAT)) for text in _DATE_RANGE
    ]
    lower = bounds[0]
    upper = bounds[1]

    samples = numpy.random.normal(
        loc=(lower + upper) * 0.5,
        scale=(upper - lower) * _EMPIRICAL_SCALE_RATIO,
        size=_DISTRIBUTION_SIZE,
    )

    # Format the timestamps in ascending order so the dates print oldest first.
    formatted = []
    for stamp in numpy.sort(samples):
        formatted.append(time.strftime(_DATE_FORMAT, time.localtime(stamp)))
    print(tuple(formatted))
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
可以考虑使用 scipy.stats.truncnorm 来生成截断正态分布,而不是使用 _EMPIRICAL_SCALE_RATIO。
下面是使用 datetime 模块的实现,它还允许生成小时、分钟、秒,并使用 Numpy/Pandas 友好的日期格式。
from datetime import datetime
import numpy
def main(start, end, date_format, distribution_size, scale_ratio):
    """Print a sorted tuple of normally distributed random datetimes.

    The samples are centred on the midpoint between ``start`` and ``end``.
    Their standard deviation is the width of that interval multiplied by
    ``scale_ratio``.  Samples are not clipped, so some may fall outside
    ``[start, end]``.

    Args:
        start: First reference date, a string matching ``date_format``.
        end: Second reference date, a string matching ``date_format``.
        date_format: ``strptime`` pattern used to parse ``start`` and ``end``.
        distribution_size: Number of samples to draw.
        scale_ratio: Fraction of ``end - start`` used as standard deviation.

    Raises:
        ValueError: If ``start`` or ``end`` does not match ``date_format``.
    """
    # Convert both bounds to POSIX timestamps so they can be used as floats.
    start = datetime.strptime(start, date_format).timestamp()
    end = datetime.strptime(end, date_format).timestamp()

    # Centre on the interval midpoint rather than a hard-coded date literal,
    # so the function works for any range and any date_format.
    mu = (start + end) * 0.5
    sigma = (end - start) * scale_ratio
    total_distribution = numpy.random.normal(
        loc=mu, scale=sigma, size=distribution_size
    )

    # Sort the timestamps, then convert them back to datetime objects.
    sorted_distribution = numpy.sort(total_distribution)
    date_range = tuple(datetime.fromtimestamp(t) for t in sorted_distribution)
    print(date_range)
# Example invocation: 1000 samples over 1925-1992 with second-level precision,
# using a standard deviation of 5% of the interval width.
date_format = '%Y-%m-%dT%H:%M:%S'
start, end = '1925-01-01T00:00:00', '1992-01-01T00:00:00'
main(
    start=start,
    end=end,
    date_format=date_format,
    distribution_size=1000,
    scale_ratio=0.05,
)
结果:
你也可以像这样混合多个分布:
# Draw each component from its own normal distribution, then pool the samples
# into one array.  mu_*, sigma_* and size_* are placeholders for the caller's
# own values.  Uses `numpy` (not `np`) to match the `import numpy` above.
dist_1 = numpy.random.normal(loc=mu_1, scale=sigma_1, size=size_1)
dist_2 = numpy.random.normal(loc=mu_2, scale=sigma_2, size=size_2)
all_distributions = numpy.concatenate([dist_1, dist_2])