集群标准误差和缺失值

Question

集群标准误差和缺失值

3

我对集群标准误和缺失值有疑问。特别是，我想知道在R和Stata中实现协方差矩阵的集群鲁棒估计器时，如何处理聚类变量存在缺失值但未被包括为回归模型中的协变量的情况。是否有最佳实践方法来解决这个问题？

有几种选项：

1. 在拟合模型之前删除聚类变量中的缺失值行。 2. 在拟合模型之后删除具有聚类变量中缺失值的行，然后在计算集群标准误之前删除具有缺失聚类信息的行。 3. 不要删除集群鲁棒的标准误，而是为集群变量中的缺失创建一个额外的组（例如，如果有一个由两个组1和2组成的集群，则将所有NA设置为3）。

哪种方案是最佳实践？例如，R的multiwayvcov选择了选项3。

以下是简要示例，以澄清我的问题：

library(sandwich)
library(multiwayvcov)
library(lmtest)

data("petersen")
petersen <- petersen[1:200, ]

lm_fit <- lm(y ~ x, data = petersen)
# with multiwayvcov
no_missings_mvcov <- coeftest(lm_fit, cluster.vcov(model = lm_fit, cluster = ~firmid + year))
# with sandwich
no_missings_sw <- coeftest(lm_fit, vcovCL(x = lm_fit,cluster = ~firmid + year ))

petersen[1, "year"] <- NA
petersen[2, "firmid"] <- NA

lm_fit2<- lm(y ~ x, data = petersen)
# with multiwayvcov
missings_mvcov <- coeftest(lm_fit2, cluster.vcov(model = lm_fit2, cluster = ~firmid + year))
# with sandwich
missings_sw <- coeftest(lm_fit2, vcovCL(x = lm_fit2, cluster = ~ firmid + year ))

# Warning messages:
# 1: In rowsum.default(newX[, i], ...) : missing values for 'group'
# 2: In rowsum.default(newX[, i], ...) : missing values for 'group'
# 3: In rowsum.default(newX[, i], ...) : missing values for 'group'
# 4: In rowsum.default(newX[, i], ...) : missing values for 'group'

# compare multiwayvcov
no_missings_mvcov
missings_mvcov
# > no_missings_mvcov
# 
# t test of coefficients:
#   
#   Estimate Std. Error t value  Pr(>|t|)    
# (Intercept) -0.26823    0.34145 -0.7856 0.4330687    
# x            0.91004    0.23183  3.9254 0.0001194 ***
#   ---
#   Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# 
# > missings_mvcov
# 
# t test of coefficients:
#   
#   Estimate Std. Error t value  Pr(>|t|)    
# (Intercept) -0.26823    0.33627 -0.7977     0.426    
# x            0.91004    0.22839  3.9847 9.493e-05 ***
#   ---
#   Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1


# compare sandwich 
no_missings_sw
missings_sw
# > no_missings_sw
# 
# t test of coefficients:
#   
#   Estimate Std. Error t value  Pr(>|t|)    
# (Intercept) -0.26823    0.34116 -0.7862 0.4326687    
# x            0.91004    0.23146  3.9317 0.0001166 ***
#   ---
#   Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# 
# > missings_sw
# 
# t test of coefficients:
#   
#   Estimate Std. Error t value  Pr(>|t|)    
# (Intercept) -0.26823    0.33732 -0.7952 0.4274630    
# x            0.91004    0.22963  3.9631 0.0001033 ***
#   ---
#   Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1


# A closer look at preprocessing in multiwayvcov and sandwich

preprocess_clusters_mwvcov <- function(model, cluster, debug = FALSE){
  
  if (inherits(cluster, "formula")) {
    cluster_tmp <- expand.model.frame(model, cluster, na.expand = FALSE)
    cluster <- model.frame(cluster, cluster_tmp, na.action = na.pass)
  }
  else {
    cluster <- as.data.frame(cluster, stringsAsFactors = FALSE)
  }
  
  cluster_dims <- ncol(cluster)
  tcc <- 2^cluster_dims - 1
  acc <- list()
  
  for (i in 1:cluster_dims) {
    acc <- append(acc, combn(1:cluster_dims, i, simplify = FALSE))
  }
  if (debug){print(acc)}
  
  acc <- acc[-1:-cluster_dims]
  
  if(debug){print(acc)}
  
  if (!is.null(model$na.action)) {
    if (class(model$na.action) == "exclude") {
      cluster <- cluster[-model$na.action, ]
    }
    else if (class(model$na.action) == "omit") {
      cluster <- cluster[-model$na.action, ]
    }
    cluster <- as.data.frame(cluster)
  }
  
  if (debug) 
    print(class(cluster))
  i <- !sapply(cluster, is.numeric)
  cluster[i] <- lapply(cluster[i], as.character)
  if (cluster_dims > 1) {
    for (i in acc) {
      cluster <- cbind(cluster, Reduce(paste0, cluster[, 
                                                       i]))
    }
  }
  cluster
}

# > head(preprocess_clusters_mwvcov(lm_fit, ~firmid + year))
# firmid year Reduce(paste0, cluster[, i])
# 1      1   NA                          1NA
# 2     NA    2                          NA2
# 3      1    3                           13
# 4      1    4                           14
# 5      1    5                           15
# 6      1    6                           16

# > sapply(preprocess_clusters_mwvcov(lm_fit, ~firmid + year), class)
# firmid                         year Reduce(paste0, cluster[, i]) 
# "integer"                    "integer"                     "factor" 


# NA handling in sandwich

preprocess_cluster_sandwich <- function(x, cluster, ...){
  
  if (is.list(x) && !is.null(x$na.action)) class(x$na.action) <- "omit"
  ef <- estfun(x, ...)
  k <- NCOL(ef)
  n <- NROW(ef)
  
  ## set up return value with correct dimension and names
  rval <- matrix(0, nrow = k, ncol = k,
                 dimnames = list(colnames(ef), colnames(ef)))
  
  ## cluster can either be supplied explicitly or
  ## be an attribute of the model...FIXME: other specifications?
  if (is.null(cluster)) cluster <- attr(x, "cluster")
  
  ## resort to cross-section if no clusters are supplied
  if (is.null(cluster)) cluster <- 1L:n
  
  ## collect 'cluster' variables in a data frame
  if(inherits(cluster, "formula")) {
    cluster_tmp <- if("Formula" %in% loadedNamespaces()) { ## FIXME to suppress potential warnings due to | in Formula
      suppressWarnings(expand.model.frame(x, cluster, na.expand = FALSE))
    } else {
      expand.model.frame(x, cluster, na.expand = FALSE)
    }
    cluster <- model.frame(cluster, cluster_tmp, na.action = na.pass)
  } else {
    cluster <- as.data.frame(cluster)
  }
  
  ## handle omitted or excluded observations
  if((n != NROW(cluster)) && !is.null(x$na.action) && (class(x$na.action) %in% c("exclude", "omit"))) {
    cluster <- cluster[-x$na.action, , drop = FALSE]
  }
  
  if(NROW(cluster) != n) stop("number of observations in 'cluster' and 'estfun()' do not match")
  
  return(cluster)    
  
}

head(preprocess_cluster_sandwich(lm_fit2, cluster = ~ firmid + year))
# > head(preprocess_cluster_sandwich(lm_fit2, cluster = ~ firmid + year))
# firmid year
# 1      1   NA
# 2     NA    2
# 3      1    3
# 4      1    4
# 5      1    5
# 6      1    6
sapply(head(preprocess_cluster_sandwich(lm_fit2, cluster = ~ firmid + year)), class)
# > sapply(head(preprocess_cluster_sandwich(lm_fit2, cluster = ~ firmid + year)), class)
# firmid      year 
# "integer" "integer"

正如您所看到的，lm_fit和lm_fit1的标准误差不同，而点估计相同。请注意，´sandwich´返回一个错误消息，这是由于聚类变量中的缺失值导致的。函数preprocess_clusters_mwvcov现在收集了multiwayvcov包的聚类预处理。看起来multiwayvcov在模型拟合后删除了这些缺失的聚类（选项2）。这与Arthur的回答所述的reghdfe形成对比，后者通过在拟合模型之前删除所有带有缺失聚类变量的观测值来处理聚类中的缺失值（选项1）。

R的包中的文档说明“如果模型x中的观测值数量少于原始数据中的数量，因为NA处理，则可以在需要时将相同的NA处理应用于聚类（并且x $ na.action可用）”，但对于聚类变量中的观测值数量少于模型x中的观测值数量的情况没有说明。

- A.Fischer

2个回答

2

策略

我不知道是否有任何正式的证明，但我感觉在一般情况下策略1是唯一可靠的方法。至少当观测值随机缺失时，省略缺失值不应该出现问题（除了可能会损失一些效率）。

策略2似乎存在潜在危险，特别是当有许多缺失值时。如果仅从模型在完整数据集上的一部分得分/残差中计算协方差矩阵，我会惊讶于它们是否保持一致。

策略3可能可以，但这可能取决于具体的应用程序，是否将带有NA的观测值收集到自己的群集中。因此，默认情况下我不会使用此策略。

R实现

关于软件：原则上，实施策略1很简单。然而，在R包sandwich的特定情况下，这并不是这样的。这是由于该包的模块化设计：用户在调用vcovCL()之前拟合模型，只有在meatCL()中检测到问题，而bread()提取器不受NA的影响。因此，我们决定抛出明确的错误消息，指示用户注意处理此问题。

说明

在您的例子中：

coeftest(lm_fit2, vcov = vcovCL, cluster = ~ firmid + year)
## Error in meatCL(x, cluster = cluster, type = type, ...) : 
##   cannot handle NAs in 'cluster': either refit the model without
##   the NA observations in 'cluster' or impute the NAs

因此，您应该执行以下操作：

lm_fit3 <- lm(y ~ x, data = petersen, subset = !is.na(year) & !is.na(firmid))
coeftest(lm_fit3, vcov = vcovCL, cluster = ~ firmid + year)
## t test of coefficients:
## 
##             Estimate Std. Error t value  Pr(>|t|)    
## (Intercept) -0.30132    0.33879 -0.8894    0.3749    
## x            0.93566    0.23158  4.0404 7.659e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

可用性

这种改进的错误处理从sandwich 3.0-1开始提供。目前，最新的开发版本可以从R-Forge获取：install.packages("sandwich", repos="https://R-Forge.R-project.org"). 另请参阅：https://sandwich.R-Forge.R-project.org/news/。

- Achim Zeileis

网页内容由stack overflow 提供, 点击上面的

可以查看英文原文，
原文链接

- Arthur Morris · Accepted Answer

您的编辑让人感觉您更关心sandwich的工作原理。我会保留我的reghdfe答案供参考。

是否有一种最佳实践方法来解决这个问题？

如果您想要在Stata中实现聚类，那么答案是肯定的，而且可能是reghdfe。使用ssc install reghdfe进行安装。

请参阅相关文档链接，以及"单例、集群稳健标准误差和固定效应：一个糟糕的组合"和"具有高维固定效应的线性模型：一种高效且可行的估计器"。

reghdfe采用第一种方法，但您也可以通过为非缺失值分配组（即replace groupid = 9999 if mi(groupid)）来显式实现（3）。

以下验证了reghdfe删除缺失组：

sysuse auto
count // there are 74 obs
count if !mi(rep78) // five are missing the cluster var
local notMissClusterVar = `r(N)'  
reghdfe price weight length, noabsorb cluster(rep78) 
assert `e(N)' == `notMissCluster'

标准的regress命令执行相同的操作。

sysuse auto
count // there are 74 obs
count if !mi(rep78) // five are missing the cluster var
local notMissClusterVar = `r(N)'  
regress price weight length, vce(cluster rep78) 
assert `e(N)' == `notMissCluster'

如果你想知道如何处理缺失聚类变量的观测值，是将其视为单独的聚类还是删除它们，那么我认为默认情况应该是删除它们，但这当然取决于你的数据。请记住，聚类标准误差夹心估计器的假设是有无限组和每组内有有限个观测值（参见Cameron & Trivedi (2005) p. 706-707）。因此，当组数很大且缺少组信息的观测数量很少时，方法3似乎最不具问题。然而，在许多情况下，不知道组可能本身就是不合格的。

在你的例子中，你使用了Petersen (2009)的公司面板数据。他最终建议按照个人（公司）和时间段进行聚类。即使你不想以公司标识符进行聚类，也很难证明包括未知期间或公司的观测是合理的。

当然，你可能处于这样一种情况：推断缺失的观测值来自同一组。在这种情况下，这是一个你必须要提出的论点。