有几种选项:
1. 在拟合模型之前删除聚类变量中的缺失值行。 2. 在拟合模型之后删除具有聚类变量中缺失值的行,然后在计算集群标准误之前删除具有缺失聚类信息的行。 3. 不要删除集群鲁棒的标准误,而是为集群变量中的缺失创建一个额外的组(例如,如果有一个由两个组1和2组成的集群,则将所有NA设置为3)。
哪种方案是最佳实践?例如,R的
multiwayvcov
选择了选项3。以下是简要示例,以澄清我的问题:
library(sandwich)
library(multiwayvcov)
library(lmtest)
data("petersen")
petersen <- petersen[1:200, ]
lm_fit <- lm(y ~ x, data = petersen)
# with multiwayvcov
no_missings_mvcov <- coeftest(lm_fit, cluster.vcov(model = lm_fit, cluster = ~firmid + year))
# with sandwich
no_missings_sw <- coeftest(lm_fit, vcovCL(x = lm_fit,cluster = ~firmid + year ))
petersen[1, "year"] <- NA
petersen[2, "firmid"] <- NA
lm_fit2<- lm(y ~ x, data = petersen)
# with multiwayvcov
missings_mvcov <- coeftest(lm_fit2, cluster.vcov(model = lm_fit2, cluster = ~firmid + year))
# with sandwich
missings_sw <- coeftest(lm_fit2, vcovCL(x = lm_fit2, cluster = ~ firmid + year ))
# Warning messages:
# 1: In rowsum.default(newX[, i], ...) : missing values for 'group'
# 2: In rowsum.default(newX[, i], ...) : missing values for 'group'
# 3: In rowsum.default(newX[, i], ...) : missing values for 'group'
# 4: In rowsum.default(newX[, i], ...) : missing values for 'group'
# compare multiwayvcov
no_missings_mvcov
missings_mvcov
# > no_missings_mvcov
#
# t test of coefficients:
#
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -0.26823 0.34145 -0.7856 0.4330687
# x 0.91004 0.23183 3.9254 0.0001194 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# > missings_mvcov
#
# t test of coefficients:
#
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -0.26823 0.33627 -0.7977 0.426
# x 0.91004 0.22839 3.9847 9.493e-05 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# compare sandwich
no_missings_sw
missings_sw
# > no_missings_sw
#
# t test of coefficients:
#
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -0.26823 0.34116 -0.7862 0.4326687
# x 0.91004 0.23146 3.9317 0.0001166 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# > missings_sw
#
# t test of coefficients:
#
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -0.26823 0.33732 -0.7952 0.4274630
# x 0.91004 0.22963 3.9631 0.0001033 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# A closer look at preprocessing in multiwayvcov and sandwich
preprocess_clusters_mwvcov <- function(model, cluster, debug = FALSE){
if (inherits(cluster, "formula")) {
cluster_tmp <- expand.model.frame(model, cluster, na.expand = FALSE)
cluster <- model.frame(cluster, cluster_tmp, na.action = na.pass)
}
else {
cluster <- as.data.frame(cluster, stringsAsFactors = FALSE)
}
cluster_dims <- ncol(cluster)
tcc <- 2^cluster_dims - 1
acc <- list()
for (i in 1:cluster_dims) {
acc <- append(acc, combn(1:cluster_dims, i, simplify = FALSE))
}
if (debug){print(acc)}
acc <- acc[-1:-cluster_dims]
if(debug){print(acc)}
if (!is.null(model$na.action)) {
if (class(model$na.action) == "exclude") {
cluster <- cluster[-model$na.action, ]
}
else if (class(model$na.action) == "omit") {
cluster <- cluster[-model$na.action, ]
}
cluster <- as.data.frame(cluster)
}
if (debug)
print(class(cluster))
i <- !sapply(cluster, is.numeric)
cluster[i] <- lapply(cluster[i], as.character)
if (cluster_dims > 1) {
for (i in acc) {
cluster <- cbind(cluster, Reduce(paste0, cluster[,
i]))
}
}
cluster
}
# > head(preprocess_clusters_mwvcov(lm_fit, ~firmid + year))
# firmid year Reduce(paste0, cluster[, i])
# 1 1 NA 1NA
# 2 NA 2 NA2
# 3 1 3 13
# 4 1 4 14
# 5 1 5 15
# 6 1 6 16
# > sapply(preprocess_clusters_mwvcov(lm_fit, ~firmid + year), class)
# firmid year Reduce(paste0, cluster[, i])
# "integer" "integer" "factor"
# NA handling in sandwich
preprocess_cluster_sandwich <- function(x, cluster, ...){
if (is.list(x) && !is.null(x$na.action)) class(x$na.action) <- "omit"
ef <- estfun(x, ...)
k <- NCOL(ef)
n <- NROW(ef)
## set up return value with correct dimension and names
rval <- matrix(0, nrow = k, ncol = k,
dimnames = list(colnames(ef), colnames(ef)))
## cluster can either be supplied explicitly or
## be an attribute of the model...FIXME: other specifications?
if (is.null(cluster)) cluster <- attr(x, "cluster")
## resort to cross-section if no clusters are supplied
if (is.null(cluster)) cluster <- 1L:n
## collect 'cluster' variables in a data frame
if(inherits(cluster, "formula")) {
cluster_tmp <- if("Formula" %in% loadedNamespaces()) { ## FIXME to suppress potential warnings due to | in Formula
suppressWarnings(expand.model.frame(x, cluster, na.expand = FALSE))
} else {
expand.model.frame(x, cluster, na.expand = FALSE)
}
cluster <- model.frame(cluster, cluster_tmp, na.action = na.pass)
} else {
cluster <- as.data.frame(cluster)
}
## handle omitted or excluded observations
if((n != NROW(cluster)) && !is.null(x$na.action) && (class(x$na.action) %in% c("exclude", "omit"))) {
cluster <- cluster[-x$na.action, , drop = FALSE]
}
if(NROW(cluster) != n) stop("number of observations in 'cluster' and 'estfun()' do not match")
return(cluster)
}
head(preprocess_cluster_sandwich(lm_fit2, cluster = ~ firmid + year))
# > head(preprocess_cluster_sandwich(lm_fit2, cluster = ~ firmid + year))
# firmid year
# 1 1 NA
# 2 NA 2
# 3 1 3
# 4 1 4
# 5 1 5
# 6 1 6
sapply(head(preprocess_cluster_sandwich(lm_fit2, cluster = ~ firmid + year)), class)
# > sapply(head(preprocess_cluster_sandwich(lm_fit2, cluster = ~ firmid + year)), class)
# firmid year
# "integer" "integer"
正如您所看到的,
lm_fit
和lm_fit1
的标准误差不同,而点估计相同。请注意,´sandwich´返回一个错误消息,这是由于聚类变量中的缺失值导致的。
函数preprocess_clusters_mwvcov
现在收集了multiwayvcov包的聚类预处理。看起来multiwayvcov
在模型拟合后删除了这些缺失的聚类(选项2)。这与Arthur的回答所述的reghdfe
形成对比,后者通过在拟合模型之前删除所有带有缺失聚类变量的观测值来处理聚类中的缺失值(选项1)。R的包中的文档说明“如果模型x中的观测值数量少于原始数据中的数量,因为NA处理,则可以在需要时将相同的NA处理应用于聚类(并且x $ na.action可用)”,但对于聚类变量中的观测值数量少于模型x中的观测值数量的情况没有说明。
assert
语句将失败,因为reghdfe
会将它们删除。 - Arthur Morris