我需要达到以下条件,
result
使用的数据集:df_sales3
if column Avg_sales_greaterthan_7 == 'YES'
{
column Avg_sales_after_outliner_rejection = column Avg_cache_out
}
else if column Avg_sales_greaterthan_7 == 'NO'
{
column Avg_sales_after_outliner_rejection = column Avg_sales_for_3mon
}
我用这个sparkR代码来实现这个条件:
|Location_code| Avg_cache | Avg_sales_for_3mon | Avg_sales_greaterthan_7|Avg_cache_out|Avg_sales_after_outliner_rejection|
+-------------+------------------+---------------------+------------------------+-------------+----------------------------------+
| 1003| 752.0| 8.17| YES| 5.15| 5.15|
| 1010| 1906.0| 13.33| NO | 20.72| 13.33|
| 1014| 7965.0| 86.58| YES| 80.32| 80.32|
| 1031|3199.6400000000003| 34.78| YES| 30.88| 30.88|
| 1040|1690.5069999999998| 18.38| YES| 14.21| 14.21|
| 1047| 1000.0| 10.87| NO | 8.73| 10.87|
| 1061| 1133.0| 12.32| NO | 8.61| 12.32|
有没有更高效的方法来编写这段代码,例如使用函数?
答案 0 :(得分:2)
您可以使用原始 SQL 和 CASE WHEN 表达式:
# Build a small Spark DataFrame from a local data.frame with a logical flag
# `foo` and two numeric columns `x` and `y`.
# NOTE(review): `sqlContext` is not created in this snippet; it must already
# exist in the session.
df <- createDataFrame(sqlContext,
data.frame(foo=c(TRUE, FALSE, TRUE), x=c(1, 0, 3), y=c(-1, -3, -5)))
# Register the DataFrame under the name "df" so the SQL query below can
# refer to it.
registerTempTable(df, "df")
# Select every column plus `bar`, which is `x` where `foo` is true and `y`
# otherwise; head() brings the first rows back for display.
head(sql(sqlContext, "SELECT *, CASE WHEN foo THEN x ELSE y END as bar FROM df"))
## foo x y bar
## 1 TRUE 1 -1 1
## 2 FALSE 0 -3 -3
## 3 TRUE 3 -5 3
使用 when / otherwise 这样的函数:
# Column-expression form of the same rule: df$x where foo is TRUE, df$y
# otherwise. The expression is not assigned to anything here.
otherwise(when(df$foo == TRUE, df$x), df$y)
应该也可以正常工作,但看起来它在 1.5 版本中无法正常工作。
答案 1 :(得分:2)
使用sqldf
你可以这样做
library(sqldf)
# Return all columns of `data` plus a column `new` holding col5 where col4 is
# 'YES' and col3 otherwise. The result is returned, not stored back in `data`.
sqldf("select * , case when col4 == 'YES' then col5 else col3 end new from data")
使用apply
# Add column `new`: col5 where col4 is "YES", otherwise col3.
# Vectorized ifelse() works on whole columns at once. The earlier row-wise
# apply() coerced each row of the data.frame to character, so the numbers had
# to be parsed back with as.numeric() (a lossy round trip through text).
data$new <- ifelse(data$col4 == "YES", data$col5, data$col3)
#> data
# col1 col2 col3 col4 col5 col6 new
#1 1003 752.000 8.17 YES 5.15 5.15 5.15
#2 1010 1906.000 13.33 NO 20.72 13.33 13.33
#3 1014 7965.000 86.58 YES 80.32 80.32 80.32
#4 1031 3199.640 34.78 YES 30.88 30.88 30.88
#5 1040 1690.507 18.38 YES 14.21 14.21 14.21
#6 1047 1000.000 10.87 NO 8.73 10.87 10.87
#7 1061 1133.000 12.32 NO 8.61 12.32 12.32
使用data.table
你可以这样做
library(data.table)
# Convert `data` to a data.table by reference and add column `new` in place:
# col5 where col4 is "YES", otherwise col3.
# The condition is evaluated on the whole column at once, so no per-row
# grouping (`by = 1:nrow(data)`) is needed; that made one group per row and
# misbehaved on a zero-row table.
setDT(data)[, new := ifelse(col4 == "YES", col5, col3)]
#> data
# col1 col2 col3 col4 col5 col6 new
#1: 1003 752.000 8.17 YES 5.15 5.15 5.15
#2: 1010 1906.000 13.33 NO 20.72 13.33 13.33
#3: 1014 7965.000 86.58 YES 80.32 80.32 80.32
#4: 1031 3199.640 34.78 YES 30.88 30.88 30.88
#5: 1040 1690.507 18.38 YES 14.21 14.21 14.21
#6: 1047 1000.000 10.87 NO 8.73 10.87 10.87
#7: 1061 1133.000 12.32 NO 8.61 12.32 12.32
示例数据
# Example data: seven rows, six columns. col4 is a factor with levels
# "NO" and "YES"; col6 holds col5 where col4 is "YES" and col3 otherwise.
data <- data.frame(
  col1 = c(1003L, 1010L, 1014L, 1031L, 1040L, 1047L, 1061L),
  col2 = c(752, 1906, 7965, 3199.64, 1690.507, 1000, 1133),
  col3 = c(8.17, 13.33, 86.58, 34.78, 18.38, 10.87, 12.32),
  col4 = factor(
    c("YES", "NO", "YES", "YES", "YES", "NO", "NO"),
    levels = c("NO", "YES")
  ),
  col5 = c(5.15, 20.72, 80.32, 30.88, 14.21, 8.73, 8.61),
  col6 = c(5.15, 13.33, 80.32, 30.88, 14.21, 10.87, 12.32)
)
答案 2 :(得分:0)
我们可以试试这个
# Flag the rows whose greaterthan_7 value is "YES".
temp <- df_sales3$greaterthan_7 == "YES"
# Fill after_outliner_rejection in one vectorized step: cache_out where the
# flag is TRUE, for_3mon otherwise. A single ifelse() yields NA for rows whose
# flag is NA, whereas assigning through a logical index that contains NA
# raises an error.
# NOTE(review): SparkR also provides ifelse() for Column arguments, so this
# form may work on a SparkR DataFrame as well; confirm against the SparkR
# version in use.
df_sales3$after_outliner_rejection <- ifelse(
  temp,
  df_sales3$cache_out,
  df_sales3$for_3mon
)
请注意,为了清楚起见,我修改了列名称。