Julia:使用 DataFramesMeta 进行数据框转换

4

我正在尝试在Julia中复制以下的R代码

library(dplyr)

women_new <- rbind(women, c(NA, 1), c(NA, NA))
women_new %>% 
  filter(height %>% complete.cases) %>%
  mutate(sector = character(n()),
         sector = replace(sector, height >= 0 & height <= 60, "1"),
         sector = replace(sector, height >= 61 & height <= 67, "2"), 
         sector = replace(sector, height >= 68 & height <= 72, "3"))

我的 Julia 尝试如下:

using DataFrames
using DataFramesMeta
using Lazy
using RDatasets

women = @> begin
  "datasets" 
  dataset("women")
  DataArray()
  vcat([[NA NA]; [NA NA]])
end

women_new = DataFrame(Height = women[:, 1], Weight = women[:, 2]);
women_new[16, 2] = 1;

我的第一个问题是:是否有一种方法可以像在 R 中那样,直接在 vcat 中输入 1,即写成 vcat([[NA 1]; [NA NA]])?如果这样做,它会返回以下错误:

MethodError: Cannot `convert` an object of type DataArrays.NAtype to an object of type Int64
This may have arisen from a call to the constructor Int64(...),
since type constructors fall back to convert methods.
 in macro expansion at multidimensional.jl:431 [inlined]
 in macro expansion at cartesian.jl:64 [inlined]
 in macro expansion at multidimensional.jl:429 [inlined]
 in _unsafe_batchsetindex!(::Array{Int64,2}, ::Base.Repeated{DataArrays.NAtype}, ::UnitRange{Int64}, ::UnitRange{Int64}) at multidimensional.jl:421
 in setindex!(::Array{Int64,2}, ::DataArrays.NAtype, ::UnitRange{Int64}, ::UnitRange{Int64}) at abstractarray.jl:832
 in cat_t(::Int64, ::Type{T}, ::DataArrays.NAtype, ::Vararg{Any,N}) at abstractarray.jl:1098
 in hcat(::DataArrays.NAtype, ::Int64) at abstractarray.jl:1180
 in include_string(::String, ::String) at loading.jl:441
 in include_string(::String, ::String, ::Int64) at eval.jl:30
 in include_string(::Module, ::String, ::String, ::Int64, ::Vararg{Int64,N}) at eval.jl:34
 in (::Atom.##53#56{String,Int64,String})() at eval.jl:50
 in withpath(::Atom.##53#56{String,Int64,String}, ::String) at utils.jl:30
 in withpath(::Function, ::String) at eval.jl:38
 in macro expansion at eval.jl:49 [inlined]
 in (::Atom.##52#55{Dict{String,Any}})() at task.jl:60

我的第二个问题是:是否有一种方法可以将 DataArray 转换为 DataFrame?在这种情况下,由于 DataArray 没有列名,列名会变成 X1、X2、……或 DataFrame 中的其他默认名称。我认为这比输入以下内容更整洁:

women_new = DataFrame(Height = women[:, 1], Weight = women[:, 2]);

我希望可以简单地执行 convert(DataFrame, women),然后再重命名列名。但是这种转换不起作用。接下来是我尝试复现 R 中转换(mutate)操作的方式:
@> begin
  women_new
  @where !isna(:Height)
  @transform(
    Sector = NA,
    Sector = ifelse(:Height .>=  0 & :Height .<= 60, 1,
             ifelse(:Height .>= 61 & :Height .<= 67, 2,
             ifelse(:Height .>= 68 & :Height .<= 72, 3, NA)))
    )
end

但是这将返回:
15×3 DataFrames.DataFrame
│ Row │ Height │ Weight │ Sector│
├─────┼────────┼────────┼───────┤
│ 1   │ 58     │ 115    │ 1     │
│ 2   │ 59     │ 117    │ 1     │
│ 3   │ 60     │ 120    │ 1     │
│ 4   │ 61     │ 123    │ 1     │
│ 5   │ 62     │ 126    │ 1     │
│ 6   │ 63     │ 129    │ 1     │
│ 7   │ 64     │ 132    │ 1     │
│ 8   │ 65     │ 135    │ 1     │
│ 9   │ 66     │ 139    │ 1     │
│ 10  │ 67     │ 142    │ 1     │
│ 11  │ 68     │ 146    │ 1     │
│ 12  │ 69     │ 150    │ 1     │
│ 13  │ 70     │ 154    │ 1     │
│ 14  │ 71     │ 159    │ 1     │
│ 15  │ 72     │ 164    │ 1     │

这与R不等价,我也尝试了以下方法:

@> begin
  women_new
  @where !isna(:Height)
  @transform(
    Sector = NA,
    Sector = :Height .>=  0 & :Height .<= 60 ? 1 :
             :Height .>= 61 & :Height .<= 67 ? 2 :
             :Height .>= 68 & :Height .<= 72 ? 3 :
            NA;
    )
end

但是返回以下错误:
TypeError: non-boolean (DataArrays.DataArray{Bool,1}) used in boolean context
 in (::###469#303)(::DataArrays.DataArray{Int64,1}) at DataFramesMeta.jl:55
 in (::##298#302)(::DataFrames.DataFrame) at DataFramesMeta.jl:295
 in #transform#38(::Array{Any,1}, ::Function, ::DataFrames.DataFrame) at DataFramesMeta.jl:270
 in (::DataFramesMeta.#kw##transform)(::Array{Any,1}, ::DataFramesMeta.#transform, ::DataFrames.DataFrame) at <missing>:0
 in include_string(::String, ::String) at loading.jl:441
 in include_string(::String, ::String, ::Int64) at eval.jl:30
 in include_string(::Module, ::String, ::String, ::Int64, ::Vararg{Int64,N}) at eval.jl:34
 in (::Atom.##53#56{String,Int64,String})() at eval.jl:50
 in withpath(::Atom.##53#56{String,Int64,String}, ::String) at utils.jl:30
 in withpath(::Function, ::String) at eval.jl:38
 in macro expansion at eval.jl:49 [inlined]
 in (::Atom.##52#55{Dict{String,Any}})() at task.jl:60

我很感激您能帮助我弄清楚这个问题。我的最后一个问题是:是否有一种像R那样简洁优雅的方式来缩短我的代码?


1
如果你在这里没有得到任何回应,你可能想去 Julia Discourse 上问问。我知道很多“数据人员”经常会去那里。但一定要说明这是一个交叉发帖(cross-post)。 - Chris Rackauckas
1个回答

3
我明白了。操作符优先级会产生影响,我原以为不需要括号。
using DataFrames
using DataFramesMeta
using Lazy
using RDatasets

women = dataset("datasets", "women");
women_new = vcat(
              women,
              DataFrame(Height = [NA; NA], Weight = @data [1; NA])
            )

@> begin
  women_new
  @where !isna(:Height)
  @transform(
    Class = NA,
    Class = ifelse((:Height .>=  0) & (:Height .<= 60), 1,
            ifelse((:Height .>= 61) & (:Height .<= 67), 2,
            ifelse((:Height .>= 68) & (:Height .<= 72), 3, NA)))
            )
end

更新:以上代码可以进一步简化为:

@> begin
  women_new
  @where !isna(:Height)
  @transform(
    Class = @> begin
      function (x)
         0 <= x <= 60 ?  1 :
        61 <= x <= 67 ?  2 :
        68 <= x <= 72 ?  3 :
        NA
      end
      map(:Height)
    end
  )
end

或者另一种选择是使用Query.jl,具体操作如下:

using DataFrames
using Query
using RDatasets

women = dataset("datasets", "women");
women_new = vcat(
              women,
              DataFrame(Height = [NA; NA], Weight = @data [1; NA])
            )

@from i in women_new begin
    @where !isnull(i.Height)
    @select {
        i.Height, i.Weight,
        class = 0 <= i.Height <= 60 ?  1 :
               61 <= i.Height <= 67 ?  2 :
               68 <= i.Height <= 72 ?  3 :
                0
    }
    @collect DataFrame
end

输出现在正确:
15×3 DataFrames.DataFrame
│ Row │ Height │ Weight │ Class │
├─────┼────────┼────────┼───────┤
│ 1   │ 58     │ 115    │ 1     │
│ 2   │ 59     │ 117    │ 1     │
│ 3   │ 60     │ 120    │ 1     │
│ 4   │ 61     │ 123    │ 2     │
│ 5   │ 62     │ 126    │ 2     │
│ 6   │ 63     │ 129    │ 2     │
│ 7   │ 64     │ 132    │ 2     │
│ 8   │ 65     │ 135    │ 2     │
│ 9   │ 66     │ 139    │ 2     │
│ 10  │ 67     │ 142    │ 2     │
│ 11  │ 68     │ 146    │ 3     │
│ 12  │ 69     │ 150    │ 3     │
│ 13  │ 70     │ 154    │ 3     │
│ 14  │ 71     │ 159    │ 3     │
│ 15  │ 72     │ 164    │ 3     │

如果我们不想过滤掉缺失值,而是使用完整数据,则最好的方法如下:
@> begin
  women_new
  @transform(
    Height_New = NA,
    Height_New = ifelse(isna(:Height), -1, :Height))
  @transform(
    Class = NA,
    Class = ifelse(:Height_New == -1, NA,
              ifelse((:Height_New .>=  0) & (:Height_New .<= 60), 1,
              ifelse((:Height_New .>= 61) & (:Height_New .<= 67), 2,
              ifelse((:Height_New .>= 68) & (:Height_New .<= 72), 3, NA))))
  )
  delete!(:Height_New)
end

更新: 上述代码可以进一步简化为:

@> begin
    women_new
    @transform(
        Class = @> begin
            function (x)
                isna(x)       ? NA :
                 0 <= x <= 60 ?  1 :
                61 <= x <= 67 ?  2 :
                68 <= x <= 72 ?  3 :
                NA
            end
            map(:Height)
        end
    )
end

另一种选择是使用Query.jl,操作如下:

@from i in women_new begin
    @select {
        i.Height, i.Weight,
        class = 0 <= i.Height <= 60 ?  1 :
               61 <= i.Height <= 67 ?  2 :
               68 <= i.Height <= 72 ?  3 :
                0
    }
    @collect DataFrame
end

输出结果:
17×3 DataFrames.DataFrame
│ Row │ Height │ Weight │ Class │
├─────┼────────┼────────┼───────┤
│ 1   │ 58     │ 115    │ 1     │
│ 2   │ 59     │ 117    │ 1     │
│ 3   │ 60     │ 120    │ 1     │
│ 4   │ 61     │ 123    │ 2     │
│ 5   │ 62     │ 126    │ 2     │
│ 6   │ 63     │ 129    │ 2     │
│ 7   │ 64     │ 132    │ 2     │
│ 8   │ 65     │ 135    │ 2     │
│ 9   │ 66     │ 139    │ 2     │
│ 10  │ 67     │ 142    │ 2     │
│ 11  │ 68     │ 146    │ 3     │
│ 12  │ 69     │ 150    │ 3     │
│ 13  │ 70     │ 154    │ 3     │
│ 14  │ 71     │ 159    │ 3     │
│ 15  │ 72     │ 164    │ 3     │
│ 16  │ NA     │ 1      │ NA    │
│ 17  │ NA     │ NA     │ NA    │

在这种情况下,代码变得混乱,因为目前还没有处理ifelse的第一个参数中NAs的方法。

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接