基于其他变量创建新变量(错误:强制转换引入了NA值)

3

我使用的是R编程语言。假设我有以下数据集:

# Example data: four players, each listed twice; both score columns are
# stored as character strings rather than numbers.
my_data <- data.frame(
  name = rep(c("john", "jason", "jack", "jim"), times = 2),
  points_1 = as.character(c(150, 165, 183, 191, 151, 166, 184, 192)),
  points_2 = as.character(c(250, 265, 283, 291, 251, 266, 284, 292))
)


#view data

my_data
   name points_1 points_2
1  john      150      250
2 jason      165      265
3  jack      183      283
4   jim      191      291
5  john      151      251
6 jason      166      266
7  jack      184      284
8   jim      192      292

我正在尝试创建一个最终数据集,其外观如下:

# Hand-typed illustration of the desired result: the original three columns
# plus three label columns (var1, var2, var3) built from points_1 / points_2.
# NOTE(review): the label strings below are typed with inconsistent spacing,
# the var2 entries for rows 5-8 repeat those of rows 1-4, and the "jim" var2
# entries lack a ">" -- presumably typos rather than part of the requirement.
my_data_final <- data.frame(


"name" = c("john", "jason", "jack", "jim", "john", "jason", "jack", "jim" ),
"points_1" = c("150", "165", "183", "191", "151", "166", "184", "192"),
"points_2" = c("250", "265", "283", "291", "251", "266", "284", "292"),
"var1" = c("< 150", "< 165", "< 183", " < 191", " < 151", " < 166", " < 184", "< 192"),
"var2" = c("<150 and > 250", "< 165 and > 265", "<183 and > 283", "< 191 and 291", "<150 and > 250", "< 165 and > 265", "<183 and > 283", "< 191 and 291"),
"var3" = c(">250", ">265", ">283", ">291", ">251", ">266", ">284", ">292")
)

#view desired data


   my_data_final
   name points_1 points_2   var1            var2 var3
1  john      150      250  < 150  <150 and > 250 >250
2 jason      165      265  < 165 < 165 and > 265 >265
3  jack      183      283  < 183  <183 and > 283 >283
4   jim      191      291  < 191   < 191 and 291 >291
5  john      151      251  < 151  <150 and > 250 >251
6 jason      166      266  < 166 < 165 and > 265 >266
7  jack      184      284  < 184  <183 and > 283 >284
8   jim      192      292  < 192   < 191 and 291 >292
我尝试手动创建“迷你变量”,并将它们手动组合成所需的格式:
#example (from here: https://dev59.com/VlfUa4cB1Zd3GeqPIoAy) :

# Helper column holding the comparison symbol that is appended to each score.
my_data$var_1a = ">"

# paste() produces strings such as "150>", which as.numeric() cannot parse as
# numbers; every element therefore becomes NA and R emits the
# "NAs introduced by coercion" warning.
my_data$var_1 <- as.numeric(paste(my_data$points_1, my_data$var_1a, sep = ""))
但这将导致以下“警告”和NA值:
Warning message:
NAs introduced by coercion 

 my_data
   name points_1 points_2   var1            var2 points_2.1 var_1a var_1
1  john      150      250  < 150  <150 and > 250       >250      >    NA
2 jason      165      265  < 165 < 165 and > 265       >265      >    NA
3  jack      183      283  < 183  <183 and > 283       >283      >    NA
4   jim      191      291  < 191   < 191 and 291       >291      >    NA
5  john      151      251  < 151  <150 and > 250       >251      >    NA
6 jason      166      266  < 166 < 165 and > 265       >266      >    NA
7  jack      184      284  < 184  <183 and > 283       >284      >    NA
8   jim      192      292  < 192   < 191 and 291       >292      >    NA

有人可以向我展示如何修复此错误并创建所需的变量“var1”,“var2”和“var3”吗?

谢谢

3个回答

阿里云服务器只需要99元/年,新老用户同享,点击查看详情
4
我们可以轻松地使用 base R 完成这个操作。先用 sprintf 为每一行构造一个以逗号分隔的字符串,再用 read.csv 把这些字符串读成多列,最后用 cbind 与原始数据合并。
# Format every row as one CSV record ("< p1,<p1 and > p2,>p2"), let read.csv()
# split the records into the columns var1..var3, and bind them onto my_data.
my_data <- cbind(
  my_data,
  read.csv(
    text = sprintf(
      "< %s,<%s and > %s,>%s",
      my_data$points_1, my_data$points_1, my_data$points_2, my_data$points_2
    ),
    header = FALSE,
    col.names = paste0("var", 1:3)
  )
)

-输出

my_data
   name points_1 points_2  var1           var2 var3
1  john      150      250 < 150 <150 and > 250 >250
2 jason      165      265 < 165 <165 and > 265 >265
3  jack      183      283 < 183 <183 and > 283 >283
4   jim      191      291 < 191 <191 and > 291 >291
5  john      151      251 < 151 <151 and > 251 >251
6 jason      166      266 < 166 <166 and > 266 >266
7  jack      184      284 < 184 <184 and > 284 >284
8   jim      192      292 < 192 <192 and > 292 >292

就像@thelatemail建议的那样,我们可以在格式字符串中使用位置说明符 %1$s 和 %2$s 按序号引用参数,从而避免重复传入相同的参数。

# Same result as the four-argument sprintf() call, but "%1$s" / "%2$s" refer to
# the first and second argument by position, so each column is passed once.
# The template deliberately has no blanks next to the commas: read.csv() keeps
# leading/trailing whitespace in character fields by default, so a stray blank
# there would end up inside the var1 / var3 values.
my_data <- cbind(my_data, read.csv(text = with(my_data,
    sprintf("< %1$s,<%1$s and > %2$s,>%2$s", points_1, points_2)),
  header = FALSE, col.names = paste0("var", 1:3)))
另一种做法是使用 glue 和 separate:
library(dplyr)
library(tidyr)
# Interpolate both scores into one comma-delimited string per row, then split
# that helper column into var1, var2 and var3. The result is printed only.
my_data %>%
  mutate(
    vars = glue::glue(
      "<{points_1},<{points_1} and >{points_2},>{points_2}"
    )
  ) %>%
  separate(col = vars, into = c("var1", "var2", "var3"), sep = ",")
   name points_1 points_2 var1          var2 var3
1  john      150      250 <150 <150 and >250 >250
2 jason      165      265 <165 <165 and >265 >265
3  jack      183      283 <183 <183 and >283 >283
4   jim      191      291 <191 <191 and >291 >291
5  john      151      251 <151 <151 and >251 >251
6 jason      166      266 <166 <166 and >266 >266
7  jack      184      284 <184 <184 and >284 >284
8   jim      192      292 <192 <192 and >292 >292

2
你可以通过按数字调用第一个和第二个参数来简化并使sprintf更加明确 - 例如:with(my_data, sprintf("<%1$s ,<%1$s and >%2$s, >%2$s", points_1, points_2)) - thelatemail

3

这是您正在寻找的解决方案吗?

library(tidyverse)

# Same example data as in the question; the score columns are character.
my_data <- data.frame(
  name = rep(c("john", "jason", "jack", "jim"), times = 2),
  points_1 = as.character(c(150, 165, 183, 191, 151, 166, 184, 192)),
  points_2 = as.character(c(250, 265, 283, 291, 251, 266, 284, 292))
)

# Add the three label columns by gluing comparison symbols onto the scores.
# paste0() coerces its arguments to character itself, so no explicit
# conversion is needed. The result is printed, not assigned.
my_data %>%
  mutate(
    var1 = paste0("<", points_1),
    var2 = paste0(">", points_1, " and ", "<", points_2),
    var3 = paste0(">", points_2)
  )
#>    name points_1 points_2 var1          var2 var3
#> 1  john      150      250 <150 >150 and <250 >250
#> 2 jason      165      265 <165 >165 and <265 >265
#> 3  jack      183      283 <183 >183 and <283 >283
#> 4   jim      191      291 <191 >191 and <291 >291
#> 5  john      151      251 <151 >151 and <251 >251
#> 6 jason      166      266 <166 >166 and <266 >266
#> 7  jack      184      284 <184 >184 and <284 >284
#> 8   jim      192      292 <192 >192 and <292 >292

使用基础R:

# Base-R variant: the scores are numeric here; paste0() converts them to text
# while gluing on the comparison symbols.
my_data <- data.frame(
  name = rep(c("john", "jason", "jack", "jim"), times = 2),
  points_1 = c(150, 165, 183, 191, 151, 166, 184, 192),
  points_2 = c(250, 265, 283, 291, 251, 266, 284, 292)
)

# Each label column is added in place with plain column assignment.
my_data[["var1"]] <- paste0("<", my_data[["points_1"]])
my_data[["var2"]] <- paste0(
  ">", my_data[["points_1"]], " and ", "<", my_data[["points_2"]]
)
my_data[["var3"]] <- paste0(">", my_data[["points_2"]])
my_data
#>    name points_1 points_2 var1          var2 var3
#> 1  john      150      250 <150 >150 and <250 >250
#> 2 jason      165      265 <165 >165 and <265 >265
#> 3  jack      183      283 <183 >183 and <283 >283
#> 4   jim      191      291 <191 >191 and <291 >291
#> 5  john      151      251 <151 >151 and <251 >251
#> 6 jason      166      266 <166 >166 and <266 >266
#> 7  jack      184      284 <184 >184 and <284 >284
#> 8   jim      192      292 <192 >192 and <292 >292

谢谢您的回答!我正在使用一家没有互联网或USB端口的公司。我有一个较旧版本的R和一些基本库(例如base R,dplyr,data.table...但没有tidyr,tidyverse等)。是否有更基本的方法来完成这个任务?谢谢! - stats_noob
是的 - 我用基本的R方法更新了我的答案。 - jared_mamrot

3

你也可以使用以下解决方案,使用purrr中的map2函数:

library(dplyr)  # provides mutate(); purrr and tidyr do not attach it
library(purrr)
library(tidyr)

# For each row, map2() builds a one-row tibble holding the three labels; the
# resulting list-column is then expanded into regular columns by unnest().
# paste() inserts its default " " separator between the pieces, so the labels
# come out with double blanks (e.g. "<  150").
my_data %>%
  mutate(output = map2(points_1, points_2, ~ {
    tibble(var1 = paste("< ", .x),
           var2 = paste("< ", .x, " and ", "> ", .y),
           var3 = paste("> ", .y))
  })) %>%
  unnest(output)

# A tibble: 8 x 6
  name  points_1 points_2 var1   var2                var3  
  <chr> <chr>    <chr>    <chr>  <chr>               <chr> 
1 john  150      250      <  150 <  150  and  >  250 >  250
2 jason 165      265      <  165 <  165  and  >  265 >  265
3 jack  183      283      <  183 <  183  and  >  283 >  283
4 jim   191      291      <  191 <  191  and  >  291 >  291
5 john  151      251      <  151 <  151  and  >  251 >  251
6 jason 166      266      <  166 <  166  and  >  266 >  266
7 jack  184      284      <  184 <  184  and  >  284 >  284
8 jim   192      292      <  192 <  192  and  >  292 >  292

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,