我试图在Julia中重现以下R代码
library(dplyr)
women_new <- rbind(women, c(NA, 1), c(NA, NA))
women_new %>%
filter(height %>% complete.cases) %>%
mutate(sector = character(n()),
sector = replace(sector, height >= 0 & height <= 60, "1"),
sector = replace(sector, height >= 61 & height <= 67, "2"),
sector = replace(sector, height >= 68 & height <= 72, "3"))
我在朱莉娅的尝试如下:
using DataFrames
using DataFramesMeta
using Lazy
using RDatasets
women = @> begin
"datasets"
dataset("women")
DataArray()
vcat([[NA NA]; [NA NA]])
end
women_new = DataFrame(Height = women[:, 1], Weight = women[:, 2]);
women_new[16, 2] = 1;
我的第一个问题是,有没有办法像{R}一样立即在1
输入vcat([[NA 1]; [NA NA]])
?如果我这样做,它会返回以下错误:
MethodError: Cannot `convert` an object of type DataArrays.NAtype to an object of type Int64
This may have arisen from a call to the constructor Int64(...),
since type constructors fall back to convert methods.
in macro expansion at multidimensional.jl:431 [inlined]
in macro expansion at cartesian.jl:64 [inlined]
in macro expansion at multidimensional.jl:429 [inlined]
in _unsafe_batchsetindex!(::Array{Int64,2}, ::Base.Repeated{DataArrays.NAtype}, ::UnitRange{Int64}, ::UnitRange{Int64}) at multidimensional.jl:421
in setindex!(::Array{Int64,2}, ::DataArrays.NAtype, ::UnitRange{Int64}, ::UnitRange{Int64}) at abstractarray.jl:832
in cat_t(::Int64, ::Type{T}, ::DataArrays.NAtype, ::Vararg{Any,N}) at abstractarray.jl:1098
in hcat(::DataArrays.NAtype, ::Int64) at abstractarray.jl:1180
in include_string(::String, ::String) at loading.jl:441
in include_string(::String, ::String, ::Int64) at eval.jl:30
in include_string(::Module, ::String, ::String, ::Int64, ::Vararg{Int64,N}) at eval.jl:34
in (::Atom.##53#56{String,Int64,String})() at eval.jl:50
in withpath(::Atom.##53#56{String,Int64,String}, ::String) at utils.jl:30
in withpath(::Function, ::String) at eval.jl:38
in macro expansion at eval.jl:49 [inlined]
in (::Atom.##52#55{Dict{String,Any}})() at task.jl:60
我的第二个问题是,有没有办法将DataArray
转换为DataFrame
?在这种情况下,列名称变为X1
,X2
,...
或DataFrame
中的任何默认名称,因为DataArray
没有列名。我认为它比输入以下内容更简洁:
women_new = DataFrame(Height = women[:, 1], Weight = women[:, 2]);
我希望我可以简单地执行convert(DataFrame, women)
并简单地重命名列名。但是这种转换不起作用。以下是我在R案例中对转换或突变的尝试。
@> begin
women_new
@where !isna(:Height)
@transform(
Sector = NA,
Sector = ifelse(:Height .>= 0 & :Height .<= 60, 1,
ifelse(:Height .>= 61 & :Height .<= 67, 2,
ifelse(:Height .>= 68 & :Height .<= 72, 3, NA)))
)
end
但这会回来:
15×3 DataFrames.DataFrame
│ Row │ Height │ Weight │ Sector│
├─────┼────────┼────────┼───────┤
│ 1 │ 58 │ 115 │ 1 │
│ 2 │ 59 │ 117 │ 1 │
│ 3 │ 60 │ 120 │ 1 │
│ 4 │ 61 │ 123 │ 1 │
│ 5 │ 62 │ 126 │ 1 │
│ 6 │ 63 │ 129 │ 1 │
│ 7 │ 64 │ 132 │ 1 │
│ 8 │ 65 │ 135 │ 1 │
│ 9 │ 66 │ 139 │ 1 │
│ 10 │ 67 │ 142 │ 1 │
│ 11 │ 68 │ 146 │ 1 │
│ 12 │ 69 │ 150 │ 1 │
│ 13 │ 70 │ 154 │ 1 │
│ 14 │ 71 │ 159 │ 1 │
│ 15 │ 72 │ 164 │ 1 │
这不等同于R,我也尝试了以下内容:
@> begin
women_new
@where !isna(:Height)
@transform(
Sector = NA,
Sector = :Height .>= 0 & :Height .<= 60 ? 1 :
:Height .>= 61 & :Height .<= 67 ? 2 :
:Height .>= 68 & :Height .<= 72 ? 3 :
NA;
)
end
但是返回以下错误:
TypeError: non-boolean (DataArrays.DataArray{Bool,1}) used in boolean context
in (::###469#303)(::DataArrays.DataArray{Int64,1}) at DataFramesMeta.jl:55
in (::##298#302)(::DataFrames.DataFrame) at DataFramesMeta.jl:295
in #transform#38(::Array{Any,1}, ::Function, ::DataFrames.DataFrame) at DataFramesMeta.jl:270
in (::DataFramesMeta.#kw##transform)(::Array{Any,1}, ::DataFramesMeta.#transform, ::DataFrames.DataFrame) at <missing>:0
in include_string(::String, ::String) at loading.jl:441
in include_string(::String, ::String, ::Int64) at eval.jl:30
in include_string(::Module, ::String, ::String, ::Int64, ::Vararg{Int64,N}) at eval.jl:34
in (::Atom.##53#56{String,Int64,String})() at eval.jl:50
in withpath(::Atom.##53#56{String,Int64,String}, ::String) at utils.jl:30
in withpath(::Function, ::String) at eval.jl:38
in macro expansion at eval.jl:49 [inlined]
in (::Atom.##52#55{Dict{String,Any}})() at task.jl:60
如果你能帮助我解决这个问题我很感激。最后,我的最后一个问题是,有没有办法缩短我的代码,就像在R中那样,但仍然优雅?
答案 0 :(得分:2)
我明白了。对运算符优先级有影响,我认为不需要括号。
using DataFrames
using DataFramesMeta
using Lazy
using RDatasets
women = dataset("datasets", "women");
women_new = vcat(
women,
DataFrame(Height = [NA; NA], Weight = @data [1; NA])
)
@> begin
women_new
@where !isna(:Height)
@transform(
Class = NA,
Class = ifelse((:Height .>= 0) & (:Height .<= 60), 1,
ifelse((:Height .>= 61) & (:Height .<= 67), 2,
ifelse((:Height .>= 68) & (:Height .<= 72), 3, NA)))
)
end
更新:上述代码可以进一步简化为:
@> begin
women_new
@where !isna(:Height)
@transform(
Class = @> begin
function (x)
0 <= x <= 60 ? 1 :
61 <= x <= 67 ? 2 :
68 <= x <= 72 ? 3 :
NA
end
map(:Height)
end
)
end
或者另一种方法是使用Query.jl,如下所示:
using DataFrames
using Query
using RDatasets
women = dataset("datasets", "women");
women_new = vcat(
women,
DataFrame(Height = [NA; NA], Weight = @data [1; NA])
)
@from i in women_new begin
@where !isnull(i.Height)
@select {
i.Height, i.Weight,
class = 0 <= i.Height <= 60 ? 1 :
61 <= i.Height <= 67 ? 2 :
68 <= i.Height <= 72 ? 3 :
0
}
@collect DataFrame
end
现在输出正确:
15×3 DataFrames.DataFrame
│ Row │ Height │ Weight │ Class │
├─────┼────────┼────────┼───────┤
│ 1 │ 58 │ 115 │ 1 │
│ 2 │ 59 │ 117 │ 1 │
│ 3 │ 60 │ 120 │ 1 │
│ 4 │ 61 │ 123 │ 2 │
│ 5 │ 62 │ 126 │ 2 │
│ 6 │ 63 │ 129 │ 2 │
│ 7 │ 64 │ 132 │ 2 │
│ 8 │ 65 │ 135 │ 2 │
│ 9 │ 66 │ 139 │ 2 │
│ 10 │ 67 │ 142 │ 2 │
│ 11 │ 68 │ 146 │ 3 │
│ 12 │ 69 │ 150 │ 3 │
│ 13 │ 70 │ 154 │ 3 │
│ 14 │ 71 │ 159 │ 3 │
│ 15 │ 72 │ 164 │ 3 │
如果我们不想过滤NA并使用完整的数据,那么我能做的就是:
@> begin
women_new
@transform(
Height_New = NA,
Height_New = ifelse(isna(:Height), -1, :Height))
@transform(
Class = NA,
Class = ifelse(:Height_New == -1, NA,
ifelse((:Height_New .>= 0) & (:Height_New .<= 60), 1,
ifelse((:Height_New .>= 61) & (:Height_New .<= 67), 2,
ifelse((:Height_New .>= 68) & (:Height_New .<= 72), 3, NA))))
)
delete!(:Height_New)
end
更新:上述代码可以进一步简化为:
@> begin
women_new
@transform(
Class = @> begin
function (x)
isna(x) ? NA :
0 <= x <= 60 ? 1 :
61 <= x <= 67 ? 2 :
68 <= x <= 72 ? 3 :
NA
end
map(:Height)
end
)
end
或者另一种方法是使用Query.jl,如下所示:
@from i in women_new begin
@select {
i.Height, i.Weight,
class = 0 <= i.Height <= 60 ? 1 :
61 <= i.Height <= 67 ? 2 :
68 <= i.Height <= 72 ? 3 :
0
}
@collect DataFrame
end
输出:
17×3 DataFrames.DataFrame
│ Row │ Height │ Weight │ Class │
├─────┼────────┼────────┼───────┤
│ 1 │ 58 │ 115 │ 1 │
│ 2 │ 59 │ 117 │ 1 │
│ 3 │ 60 │ 120 │ 1 │
│ 4 │ 61 │ 123 │ 2 │
│ 5 │ 62 │ 126 │ 2 │
│ 6 │ 63 │ 129 │ 2 │
│ 7 │ 64 │ 132 │ 2 │
│ 8 │ 65 │ 135 │ 2 │
│ 9 │ 66 │ 139 │ 2 │
│ 10 │ 67 │ 142 │ 2 │
│ 11 │ 68 │ 146 │ 3 │
│ 12 │ 69 │ 150 │ 3 │
│ 13 │ 70 │ 154 │ 3 │
│ 14 │ 71 │ 159 │ 3 │
│ 15 │ 72 │ 164 │ 3 │
│ 16 │ NA │ 1 │ NA │
│ 17 │ NA │ NA │ NA │
在这种情况下,代码变得混乱,因为在ifelse
的第一个参数中还没有处理NA的方法。