按天和组别展示汇总统计数据

4
假设有以下数据框:
head(df, 9)
         Day               variable     value
1 2015-10-18   Number_Flows.minimum  401.0000
2 2015-10-18   Number_Flows.maximum 2068.0000
3 2015-10-18   Number_Flows.average 1578.9474
4 2015-10-18 Number_srcaddr.minimum   95.0000
5 2015-10-18 Number_srcaddr.maximum  292.0000
6 2015-10-18 Number_srcaddr.average  222.6316
7 2015-10-18 Number_dstaddr.minimum   65.0000
8 2015-10-18 Number_dstaddr.maximum  411.0000
9 2015-10-18 Number_dstaddr.average  202.5789

我想为每个指标(“Number_Flows”、“Number_srcaddr”等)分别绘制“最小值”、“最大值”和“平均值”。我希望用柱状图显示这些值,但其他方式也可以,只要对于下面给出的可复现示例,我最终能得到总共22个图表(每天11个)。
我尝试了各种方法,但没有成功。
library(dplyr)
library(ggplot2)


# Tag every row with a "<Day>-<metric>" label; the metric is `variable` with
# everything from the first "." onwards removed.
df_grouped <- df %>%
  mutate(group = paste(Day, gsub("\\..*", "", variable), sep = "-"))

# Shared skeleton for all attempts: value against Day, one facet per label.
plot_base <- ggplot(df_grouped, aes(x = Day, y = value)) +
  facet_wrap(~group)

# Attempts 1 and 2 (identical in the question): bars of the raw values.
plot_base + geom_bar(stat = "identity")
plot_base + geom_bar(stat = "identity")

# Attempt 3: the same data drawn as lines.
plot_base + geom_line()

数据

dput(df)
structure(list(Day = structure(c(1445115600, 1445115600, 1445115600, 
1445115600, 1445115600, 1445115600, 1445115600, 1445115600, 1445115600, 
1445115600, 1445115600, 1445115600, 1445115600, 1445115600, 1445115600, 
1445115600, 1445115600, 1445115600, 1445115600, 1445115600, 1445115600, 
1445115600, 1445115600, 1445115600, 1445115600, 1445115600, 1445115600, 
1445115600, 1445115600, 1445115600, 1445115600, 1445115600, 1445115600, 
1445202000, 1445202000, 1445202000, 1445202000, 1445202000, 1445202000, 
1445202000, 1445202000, 1445202000, 1445202000, 1445202000, 1445202000, 
1445202000, 1445202000, 1445202000, 1445202000, 1445202000, 1445202000, 
1445202000, 1445202000, 1445202000, 1445202000, 1445202000, 1445202000, 
1445202000, 1445202000, 1445202000, 1445202000, 1445202000, 1445202000, 
1445202000, 1445202000, 1445202000), class = c("POSIXct", "POSIXt"
), tzone = ""), variable = c("Number_Flows.minimum", "Number_Flows.maximum", 
"Number_Flows.average", "Number_srcaddr.minimum", "Number_srcaddr.maximum", 
"Number_srcaddr.average", "Number_dstaddr.minimum", "Number_dstaddr.maximum", 
"Number_dstaddr.average", "Sum_packets.minimum", "Sum_packets.maximum", 
"Sum_packets.average", "Sum_duration_nannosecs.minimum", "Sum_duration_nannosecs.maximum", 
"Sum_duration_nannosecs.average", "Average_Duration.minimum", 
"Average_Duration.maximum", "Average_Duration.average", "Average_Bytes.minimum", 
"Average_Bytes.maximum", "Average_Bytes.average", "Bytes_per_packet.minimum", 
"Bytes_per_packet.maximum", "Bytes_per_packet.average", "Sum_of_Bytes.minimum", 
"Sum_of_Bytes.maximum", "Sum_of_Bytes.average", "Actual_Batch_Duration_secs.minimum", 
"Actual_Batch_Duration_secs.maximum", "Actual_Batch_Duration_secs.average", 
"packets_per_second.minimum", "packets_per_second.maximum", "packets_per_second.average", 
"Number_Flows.minimum", "Number_Flows.maximum", "Number_Flows.average", 
"Number_srcaddr.minimum", "Number_srcaddr.maximum", "Number_srcaddr.average", 
"Number_dstaddr.minimum", "Number_dstaddr.maximum", "Number_dstaddr.average", 
"Sum_packets.minimum", "Sum_packets.maximum", "Sum_packets.average", 
"Sum_duration_nannosecs.minimum", "Sum_duration_nannosecs.maximum", 
"Sum_duration_nannosecs.average", "Average_Duration.minimum", 
"Average_Duration.maximum", "Average_Duration.average", "Average_Bytes.minimum", 
"Average_Bytes.maximum", "Average_Bytes.average", "Bytes_per_packet.minimum", 
"Bytes_per_packet.maximum", "Bytes_per_packet.average", "Sum_of_Bytes.minimum", 
"Sum_of_Bytes.maximum", "Sum_of_Bytes.average", "Actual_Batch_Duration_secs.minimum", 
"Actual_Batch_Duration_secs.maximum", "Actual_Batch_Duration_secs.average", 
"packets_per_second.minimum", "packets_per_second.maximum", "packets_per_second.average"
), value = c(401, 2068, 1578.94736842105, 95, 292, 222.631578947368, 
65, 411, 202.578947368421, 4181, 130567, 33860.2631578947, 2647278, 
10876533, 5438303.63157895, 1543.937984, 20335.58603, 4202.062837, 
692.4193548, 77207.90476, 14689.4305788105, 231.6654261, 943.7592654, 
465.315475931579, 1244970, 123223816, 19865244, 9, 30, 27.1578947368421, 
179, 4352, 1265.94736842105, 609, 2352, 1578.94736842105, 89, 
299, 219.105263157895, 92, 402, 193.578947368421, 1124, 60473, 
19022.6842105263, 944317, 20088618, 5254959.84210526, 1550.602627, 
9749.356239, 3236.99523905263, 258.9441708, 17451.96293, 5789.86937011053, 
140.2998221, 717.4807734, 424.926870810526, 157697, 33505216, 
9510806.21052632, 5, 30, 24.9473684210526, 114, 2179, 772.947368421053
)), .Names = c("Day", "variable", "value"), row.names = c(NA, 
66L), class = "data.frame")
4个回答

7

我喜欢使用线来展示时间趋势,使用带状图来显示数值范围。

和@docendo类似,我会先分离,然后再展开

library(tidyverse)

# Draw one facet per metric: the average as a line over time, with a ribbon
# spanning the minimum-to-maximum range.
df %>%
  # "Number_Flows.minimum" -> type = "Number_Flows", var = "minimum"
  separate(variable, c("type", "var"), sep = "\\.") %>%
  # One column per statistic (average / maximum / minimum);
  # pivot_wider() replaces the superseded spread().
  pivot_wider(names_from = var, values_from = value) %>%
  ggplot(aes(x = Day)) +
  # `linewidth` replaces `size` for line widths (ggplot2 >= 3.4.0).
  geom_line(aes(y = average), linewidth = 1) +
  geom_ribbon(aes(ymin = minimum, ymax = maximum), alpha = 0.2) +
  # The metrics differ by orders of magnitude, so each facet gets its own y axis.
  facet_wrap(~type, scales = "free_y") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

这里输入图片描述

如果您有更多的时间点,这张图会更好看。


5

我建议在绘图之前先把 variable 列拆分开:

library(dplyr)
library(ggplot2)
library(tidyr)

# Break "<metric>.<statistic>" into two columns: `type` (the metric) and
# `var` (minimum / maximum / average).
df_split <- separate(df, variable, into = c("type", "var"), sep = "\\.")

# One facet per metric, one point per day and statistic, coloured by statistic.
ggplot(df_split, aes(x = Day, y = value, colour = var)) +
  facet_wrap(~type) +
  geom_point() +
  # Rotate the date labels so they do not overlap.
  theme(axis.text.x = element_text(angle = -90, hjust = 0))

Imgur

您可以使用自由的 y 轴刻度(scales = "free_y")、条形图等方式,把这个图做得更精细。


这太棒了。那3个额外的值是如何出现在x轴上的? - Sotos
1
@Sotos,这是一个连续的比例尺,就像绘制x值为1和5时一样,ggplot也会将2、3和4作为间断点。 - Axeman
啊,好的。有道理。 - Sotos
3
没问题,@Axeman。如果你不喜欢这样做,可以手动设置x轴标签或使用 ggplot(aes(x = as.character(Day), y = value, color = var)) - talat
我不介意。只是需要了解它来自哪里。 - Sotos

4

一个基础 R(base R)的解决方案(或者说几乎是,因为它用到了 reshape2):

首先创建变量 "type" 和 "stat",然后按天对 data.frame 进行 split,接着重塑(reshape)每个数据框以得到所需的形状,最后用 barplot 绘图(barplot 的外观留给你自行定制)。你还可以修改 lapply 调用,利用 list 的名称保留日期,并把它用作每个图的主标题。

library(reshape2)

# Split each "<metric>.<statistic>" name into the metric (`type`) and the
# statistic (`stat`).
df$type <- sub("([^.]+)\\..+", "\\1", df$variable)
df$stat <- sub("[^.]+\\.(.+)", "\\1", df$variable)

# One data frame per day.
l_df <- split(df, df$Day)

# One panel row per day (at least one row, so an empty `df` does not make
# par() fail). par() returns the previous settings so they can be restored.
old_par <- par(mfrow = c(max(length(l_df), 1L), 1))

# invisible() keeps the list returned by lapply() from being printed at top
# level; only the plotting side effect is wanted.
invisible(lapply(l_df, function(df_day) {
  # Wide layout: one row per metric, one column per statistic.
  df_resh <- dcast(df_day, type ~ stat, value.var = "value")
  row.names(df_resh) <- df_resh$type
  # Transposed so each metric is a group of bars and each statistic a bar;
  # `legend.text` is spelled out instead of relying on partial matching of
  # `legend`.
  barplot(
    t(df_resh[, -1]),
    beside = TRUE,
    legend.text = TRUE,
    col = c("green", "blue", "red")
  )
}))

# Restore the graphics layout that was active before plotting.
par(old_par)

enter image description here


非常好!所以 par(mfrow=c(2, 1)) 只是告诉它创建两行,每行一个图表。 - Sotos
@Sotos 是的,par(mfrow=c(nrow, ncol)) 可以将你的窗口分成 nrow 行和 ncol 列,然后在每个部分绘制一个图。 - Cath

3
你可以尝试一下。
library(stringr)
# Split "<metric>.<statistic>" once on the literal "." and reuse the pieces.
name_parts <- str_split(df$variable, fixed("."))

# vapply() always returns a character vector (NA where a piece is missing),
# including for empty input, where unlist(lapply(...)) would return NULL.
df$var1 <- vapply(name_parts, `[`, character(1), 1L)  # metric
df$var2 <- vapply(name_parts, `[`, character(1), 2L)  # statistic

# One facet per metric/day combination, with one bar per statistic.
# geom_col() is the direct equivalent of geom_bar(stat = "identity").
ggplot(df, aes(x = var2, y = value)) +
  geom_col() +
  facet_wrap(var1 ~ Day, scales = "free_y")

enter image description here


网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接