使用数值建立一个表格,将连续变量分成两组。

4

我希望你能帮助我。我正在尝试将一个连续变量分为两组,以下是我想要做的事情的示例:

x=data.frame(v1=c(1,1,2,2,3,4,5,6,9,9,11,2,4,45,67,89,1,1,5,5,5,6,6,6,9,9,9,11,11,8,8,8,51,90,40,15,30,11,8,9,9,1,5,5,100,67,78,98,34,25))

我试图将我的连续变量分成两组,并设置初始值为20,然后:

g1=data.frame(x[x$v1>20,])
g2=data.frame(x[x$v1<=20,])

在计算g1和g2的平均值和标准差后:

mean(g1$x.x.v1...20...)=62.61538
mean(g2$x.x.v1....20...)=6.216216
sd(g1$x.x.v1...20...)=26.80963
sd(g2$x.x.v1....20...)=3.55227
length(g1$x.x.v1...20...)= 13
length(g2$x.x.v1....20...)=37

在此之后,我希望有一张表格,类似于以下内容:
Value   Mean.G1  SD.G1  Mean.G2  SD.G2  N.G1 N.G2
20        62.61    26.8   6.21     3.55  13   37

但这个表格不仅适用于20,我想为一个具有不同值的向量构建该表格,例如一个包含十个元素并从20开始以步长20递增的向量,像这样的向量v=c(10,30,50,70,90,110,130,150,170,190)

我希望这个问题清晰明了。谢谢


+1 是因为你尝试了很多次来改善你的问题。 - agstudy
1
@user2080848,如果您已经确定了一个答案,我建议您接受它,这样问题就会得到回答和关闭(也适用于您的其他答案)。到目前为止,您已经提出了5个问题并接受了1个答案。 - Arun
4个回答

2

我会使用reshape2和plyr这两个库,


library(plyr) ; library(reshape2)
v=c(10,20,30,50,70,90,110,130,150,170,190) # added 20 for checking
# create new dichotomy id variable
l1 = llply(v, function(.v) transform(x, test = x[["v1"]] <= .v))
names(l1) = v # name list elements for later reference
all = melt(l1, id=c("v1","test")) # merge data.frames together
# summarise the data by groups
results = ddply(all, c("L1","test"), summarise, 
          mean = mean(v1), sd=sd(v1), length=length(v1))

导致
arrange(results, as.numeric(L1))

    L1  test      mean        sd length
1   10 FALSE 48.500000 32.505656     18
2   10  TRUE  5.343750  2.902828     32
3   20 FALSE 62.615385 26.809633     13
4   20  TRUE  6.216216  3.552270     37
5   30 FALSE 69.000000 23.870484     11
6   30  TRUE  7.307692  5.907862     39
7   50 FALSE 80.000000 17.270950      8
8   50  TRUE  9.619048 10.245647     42
9   70 FALSE 91.000000  8.717798      5
10  70  TRUE 13.088889 16.555447     45
11  90 FALSE 99.000000  1.414214      2
12  90  TRUE 17.625000 23.951747     48
13 110  TRUE 20.880000 28.456655     50
14 130  TRUE 20.880000 28.456655     50
15 150  TRUE 20.880000 28.456655     50
16 170  TRUE 20.880000 28.456655     50
17 190  TRUE 20.880000 28.456655     50

2
您可以在这里简单使用summarylapply
do.call(rbind,lapply( v,function(x) {
  v1.inf <- summary(v1[v1<=x])
  v1.sup <- summary(v1[v1>x])
  m <- as.matrix(rbind(v1.inf,v1.sup))
  rownames(m) <- paste(x,c('inf','sup'),sep='')
  m
}))

       Min. 1st Qu. Median   Mean 3rd Qu. Max.
10inf     1    2.75    5.0  5.344    8.00    9
10sup    11   17.50   42.5 48.500   75.25  100
20inf     1    4.00    6.0  6.216    9.00   15
20sup    25   40.00   67.0 62.620   89.00  100
30inf     1    4.00    6.0  7.308    9.00   30
30sup    34   48.00   67.0 69.000   89.50  100
50inf     1    4.25    7.0  9.619    9.00   45
50sup    51   67.00   83.5 80.000   92.00  100
70inf     1    5.00    8.0 13.090   11.00   67
70sup    78   89.00   90.0 91.000   98.00  100
90inf     1    5.00    8.0 17.620   12.00   90
90sup    98   98.50   99.0 99.000   99.50  100
110inf    1    5.00    8.5 20.880   22.50  100
110sup   NA      NA     NA    NaN      NA   NA
130inf    1    5.00    8.5 20.880   22.50  100
130sup   NA      NA     NA    NaN      NA   NA
150inf    1    5.00    8.5 20.880   22.50  100
150sup   NA      NA     NA    NaN      NA   NA
170inf    1    5.00    8.5 20.880   22.50  100
170sup   NA      NA     NA    NaN      NA   NA
190inf    1    5.00    8.5 20.880   22.50  100
190sup   NA      NA     NA    NaN      NA   NA

你能否使用摘要函数来消除某些计算?这将非常好,因为在处理大型数据集时,中位数需要很长时间... - Arun

2
第一步可以通过以下方式更经济地完成,并获得相同的结果:
g1=x[x$v1>20,]
g2=x[x$v1<=20,] # since "[" would have returned a dataframe

但为什么不跳过这一步,改为这样做:
do.call(cbind, by(x$v1, list(v1GT20 = x$v1 > 20), 
                      function(v) c(Mean=mean(v), SD=sd(v), N=length(v)) ) )
        FALSE    TRUE
Mean  6.21622 62.6154
SD    3.55227 26.8096
N    37.00000 13.0000

如果您想在多个位置进行裁剪,则使用切割功能来分割和识别组:
do.call(cbind, by(x$v1, cut( x$v1 , breaks=c(10,30,50,70,90,110,130,150,170,190) ), 
                               function(v) c(Mean=mean(v), SD=sd(v), N=length(v)) ) )

      (10,30]  (30,50] (50,70]  (70,90] (90,110]
Mean 16.28571 39.66667 61.6667 85.66667 99.00000
SD    7.93125  5.50757  9.2376  6.65833  1.41421
N     7.00000  3.00000  3.0000  3.00000  2.00000

如果您需要长格式的数据,那么reshape2包中的melt函数是非常有用的。我注意到breaks向量需要一个最小值参数来获取10以下的项目:

> melt( do.call(cbind, by(x$v1,
                          cut( x$v1 , breaks=c(-Inf, 10,30,50,70,90,110,130,150,170,190), 
                                               include.lowest=TRUE ), 
                          function(v) c(Mean=mean(v), SD=sd(v), N=length(v)) ) ) )

   Var1      Var2    value
1  Mean [-Inf,10]  5.34375
2    SD [-Inf,10]  2.90283
3     N [-Inf,10] 32.00000
4  Mean   (10,30] 16.28571
5    SD   (10,30]  7.93125
6     N   (10,30]  7.00000
7  Mean   (30,50] 39.66667
8    SD   (30,50]  5.50757
9     N   (30,50]  3.00000
10 Mean   (50,70] 61.66667
11   SD   (50,70]  9.23760
12    N   (50,70]  3.00000
13 Mean   (70,90] 85.66667
14   SD   (70,90]  6.65833
15    N   (70,90]  3.00000
16 Mean  (90,110] 99.00000
17   SD  (90,110]  1.41421
18    N  (90,110]  2.00000

2

Here's a data.table solution:

require(data.table)
x.dt <- data.table(x)
rbindlist(lapply(v, function(i) {
    lbls <- paste0(c(">", "<="), i)
    x.dt[, grp := as.character(factor(v1 > i, levels=c(TRUE, FALSE), labels=lbls))]
    x.dt[, as.list(c(v = i, mean = mean(v1), 
        sd = sd(v1), length = length(v1))), by = grp]
}))
#       grp   v      mean        sd length
#  1:  <=10  10  5.343750  2.902828     32
#  2:   >10  10 48.500000 32.505656     18
#  3:  <=20  20  6.216216  3.552270     37
#  4:   >20  20 62.615385 26.809633     13
#  5:  <=30  30  7.307692  5.907862     39
#  6:   >30  30 69.000000 23.870484     11
#  7:  <=50  50  9.619048 10.245647     42
#  8:   >50  50 80.000000 17.270950      8
#  9:  <=70  70 13.088889 16.555447     45
# 10:   >70  70 91.000000  8.717798      5
# 11:  <=90  90 17.625000 23.951747     48
# 12:   >90  90 99.000000  1.414214      2
# 13: <=110 110 20.880000 28.456655     50
# 14: <=130 130 20.880000 28.456655     50
# 15: <=150 150 20.880000 28.456655     50
# 16: <=170 170 20.880000 28.456655     50
# 17: <=190 190 20.880000 28.456655     50

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接