从gamlss R对象预测二元响应概率

3

我希望能够通过gamlss R函数预测二进制类别的概率/类别标签,如何使用predict函数来实现?

以下是我的样例代码:

library(gamlss)
X1 <- rnorm(500)
X2 <- sample(c("A","C","D","E"),500, replace = TRUE)
Y <- ifelse(X1>0.2& X2=="A",1,0)

n <- 500
training <- sample(1:n, 400)
testing <- (1:n)[-training]

fit <- gamlss(Y[training]~pcat(X2[training],Lp=1)+ri(X1[training],Lp=1),family=BI())

pred <- predict(fit,newdata = data.frame(X1,X2)[testing,],type = "response")

在使用选项data定义原始数据时,predict.gamlss(fit, newdata = data.frame(X1, X2)[testing, ])出现错误。

有任何想法吗?

1个回答

2

您需要使用 gamlssdata 选项来定义原始数据:

library(gamlss)
set.seed(1)
n <- 500
X1 <- rnorm(n)
X2 <- sample(c("A","C","D","E"), n, replace = TRUE)
Y <- ifelse(X1>0.2 & X2=="A", 1, 0)
dtset <- data.frame(X1, X2, Y)

training <- sample(1:n, 400)
XYtrain <- dtset[training,]
XYtest  <- dtset[-training,]

fit <- gamlss(Y ~ pcat(X2, Lp=1) + ri(X1, Lp=1), family=BI(), data=XYtrain)
pred <- predict(fit, type="response", newdata=XYtest)

很不幸,predict现在生成了一个新的错误消息:

Error in if (p != ap) stop("the dimensions of the penalty matrix and of the design matrix are incompatible") : argument is of length zero

可以通过修改predict.gamlss使用的gamlss.ri函数来解决这个问题:

gamlss.ri <- function (x, y, w, xeval = NULL, ...) 
{
    regpen <- function(sm, D, P0, lambda) {
        for (it in 1:iter) {
            RD <- rbind(R, sqrt(lambda) * sqrt(omega.) * D)
            svdRD <- svd(RD)
            rank <- sum(svdRD$d > max(svdRD$d) * .Machine$double.eps^0.8)
            np <- min(p, N)
            U1 <- svdRD$u[1:np, 1:rank]
            y1 <- t(U1) %*% Qy
            beta <- svdRD$v[, 1:rank] %*% (y1/svdRD$d[1:rank])
            dm <- max(abs(sm - beta))
            sm <- beta
            omega. <- c(1/(abs(sm)^(2 - Lp) + kappa^2))
            if (dm < c.crit) 
                break
        }
        HH <- (svdRD$u)[1:p, 1:rank] %*% t(svdRD$u[1:p, 1:rank])
        edf <- sum(diag(HH))
        fv <- X %*% beta
        row.names(beta) <- namesX
        out <- list(fv = fv, beta = beta, edf = edf, omega = omega.)
    }
    fnGAIC <- function(lambda, k) {
        fit <- regpen(sm, D, P0, lambda)
        fv <- fit$fv
        GAIC <- sum(w * (y - fv)^2) + k * fit$edf
        GAIC
    }
    X <- if (is.null(xeval)) 
        as.matrix(attr(x, "X"))
    else as.matrix(attr(x, "X"))[seq(1, length(y)), , drop=FALSE] # Added drop=FALSE
    namesX <- as.character(attr(x, "namesX"))
    D <- as.matrix(attr(x, "D"))
    order <- as.vector(attr(x, "order"))
    lambda <- as.vector(attr(x, "lambda"))
    df <- as.vector(attr(x, "df"))
    Lp <- as.vector(attr(x, "Lp"))
    kappa <- as.vector(attr(x, "kappa"))
    iter <- as.vector(attr(x, "iter"))
    k <- as.vector(attr(x, "k"))
    c.crit <- as.vector(attr(x, "c.crit"))
    method <- as.character(attr(x, "method"))
    gamlss.env <- as.environment(attr(x, "gamlss.env"))
    startLambdaName <- as.character(attr(x, "NameForLambda"))
    N <- sum(w != 0)
    n <- nrow(X)
    p <- ncol(X)
    aN <- nrow(D)
    ap <- ncol(D)
    qrX <- qr(sqrt(w) * X, tol = .Machine$double.eps^0.8)
    R <- qr.R(qrX)
    Q <- qr.Q(qrX)
    Qy <- t(Q) %*% (sqrt(w) * y)
    if (p != ap) 
        stop("the dimensions of the penalty matrix and of the design matrix are incompatible")
    P0 <- diag(p) * 1e-06
    sm <- rep(0, p)
    omega. <- rep(1, p)
    tau2 <- sig2 <- NULL
    lambdaS <- get(startLambdaName, envir = gamlss.env)
    if (lambdaS >= 1e+07) 
        lambda <- 1e+07
    if (lambdaS <= 1e-07) 
        lambda <- 1e-07
    if (is.null(df) && !is.null(lambda) || !is.null(df) && !is.null(lambda)) {
        fit <- regpen(sm, D, P0, lambda)
        fv <- fit$fv
    }
    else if (is.null(df) && is.null(lambda)) {
        lambda <- lambdaS
        switch(method, ML = {
            for (it in 1:20) {
                fit <- regpen(sm, D, P0, lambda)
                gamma. <- D %*% as.vector(fit$beta) * sqrt(fit$omega)
                fv <- X %*% fit$beta
                sig2 <- sum(w * (y - fv)^2)/(N - fit$edf)
                tau2 <- sum(gamma.^2)/(fit$edf - order)
                lambda.old <- lambda
                lambda <- sig2/tau2
                if (abs(lambda - lambda.old) < 1e-04 || lambda > 
                  1e+05) break
            }
        }, GAIC = {
            lambda <- nlminb(lambda, fnGAIC, lower = 1e-07, upper = 1e+07, 
                k = k)$par
            fit <- regpen(sm, D, P0, lambda)
            fv <- fit$fv
            assign(startLambdaName, lambda, envir = gamlss.env)
        }, )
    }
    else {
        edf1_df <- function(lambda) {
            edf <- sum(1/(1 + lambda * UDU$values))
            (edf - df)
        }
        Rinv <- solve(R)
        S <- t(D) %*% D
        UDU <- eigen(t(Rinv) %*% S %*% Rinv)
        lambda <- if (sign(edf1_df(0)) == sign(edf1_df(1e+05))) 
            1e+05
        else uniroot(edf1_df, c(0, 1e+05))$root
        fit <- regpen(sm, D, P0, lambda)
        fv <- fit$fv
    }
    waug <- as.vector(c(w, rep(1, nrow(D))))
    xaug <- as.matrix(rbind(X, sqrt(lambda) * D))
    lev <- hat(sqrt(waug) * xaug, intercept = FALSE)[1:n]
    var <- lev/w
    coefSmo <- list(coef = fit$beta, lambda = lambda, edf = fit$edf, 
        sigb2 = tau2, sige2 = sig2, sigb = if (is.null(tau2)) NA else sqrt(tau2), 
        sige = if (is.null(sig2)) NA else sqrt(sig2), fv = as.vector(fv), 
        se = sqrt(var), Lp = Lp)
    class(coefSmo) <- "ri"
    if (is.null(xeval)) {
        list(fitted.values = as.vector(fv), residuals = y - fv, 
            var = var, nl.df = fit$edf - 1, lambda = lambda, 
            coefSmo = coefSmo)
    }
    else {
        ll <- dim(as.matrix(attr(x, "X")))[1]
        nx <- as.matrix(attr(x, "X"))[seq(length(y) + 1, ll), 
            ]
        pred <- drop(nx %*% fit$beta)
        pred
    }
}
# Replace "gamlss.ri" in the package "gamlss"
assignInNamespace("gamlss.ri", gamlss.ri, pos="package:gamlss")

pred <- predict(fit, type="response", newdata=XYtest)

print(head(pred))
# [1] 2.220446e-16 2.220446e-16 2.220446e-16 4.142198e-12 2.220446e-16 2.220446e-16

马可,我可以问一下为什么会发生这种情况吗? - Khan
1
@Khan gamlss.ri 中有一个小错误:当您输入单个协变量时,命令(在 gamlss.ri 内部)as.matrix(attr(x, "X"))[seq(1, length(y)), ] 会给出一个对象 num,而不是矩阵。我添加了 as.matrix(attr(x, "X"))[seq(1, length(y)), drop=FALSE] - Marco Sandri

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接