我有一些以平面文件格式提供的数据,需要为客户整理。这是分层数据,但并不是每个单元格都填写了值;而且由于涉及许多不同的子级别,不能只做一次简单的向下填充。其中的数字代码始终是4位数字,各自表示一个特定的编号。
这些数据用于一份报告,该报告可能要发给数十个子组,数据量达数千行。
这是R中的一个示例:
# Example input: nine sparse character columns of 24 rows each. Every column
# starts out entirely NA; only the listed row positions receive a level label
# or a four-digit code.
L1 <- replace(rep(NA_character_, 24), c(1, 23, 24),
              c("Main1", "Main2", "Main3"))
L2 <- replace(rep(NA_character_, 24), 2, "Sub2_1")
L3 <- replace(rep(NA_character_, 24), c(3, 21), c("Sub3_1", "Sub3_2"))
L4 <- replace(rep(NA_character_, 24), c(4, 14, 22),
              c("Sub4_1", "Sub4_2", "0015"))
L5 <- replace(rep(NA_character_, 24), c(5, 8, 10, 15),
              c("Sub5_1", "Sub5_2", "Sub5_3", "Sub5_5"))
L6 <- replace(rep(NA_character_, 24), c(6, 7, 9, 11, 16),
              c("1111", "2885", "0001", "Sub6_1", "Sub6_2"))
L7 <- replace(rep(NA_character_, 24), c(12, 17), c("Sub7_1", "Sub7_2"))
L8 <- replace(rep(NA_character_, 24), c(13, 18, 19),
              c("0011", "9494", "Sub8_1"))
L9 <- replace(rep(NA_character_, 24), 20, "8479")
# Assemble the columns into the sample data frame.
df <- data.frame(L1, L2, L3, L4, L5, L6, L7, L8, L9)
我想要这样的输出,因为我们真正需要查找的是四位数的“代码”:
# Desired output: one row per four-digit code, followed by the top-level
# entries that have no code at all (code_f is NA for those rows).
code_f <- c("1111", "2885", "0001", "0011", "9494", "8479", "0015", NA, NA)
L1_f <- c(rep("Main1", 7), "Main2", "Main3")
L2_f <- c(rep("Sub2_1", 7), NA, NA)
L3_f <- c(rep("Sub3_1", 6), "Sub3_2", NA, NA)
L4_f <- c(rep("Sub4_1", 4), rep("Sub4_2", 2), rep(NA, 3))
L5_f <- c(rep("Sub5_1", 2), "Sub5_2", "Sub5_3", rep("Sub5_5", 2), rep(NA, 3))
# Codes 9494 and 8479 sit under "Sub6_2" in the input; the input contains no
# "Sub6_3" label.
L6_f <- c(rep(NA, 3), "Sub6_1", rep("Sub6_2", 2), rep(NA, 3))
L7_f <- c(rep(NA, 3), "Sub7_1", rep("Sub7_2", 2), rep(NA, 3))
L8_f <- c(rep(NA, 5), "Sub8_1", rep(NA, 3))
df_f <- data.frame(code_f, L1_f, L2_f, L3_f, L4_f, L5_f, L6_f, L7_f, L8_f)
答案 0 :(得分:3)
我在您的数据中看不到0015,因此无法确定其来源。但是根据您提供的信息,我们可以这样做:
# Recursively forward-fill the columns of `data` from left to right.
#
# data: data frame whose columns are processed in their existing order.
# i:    position of the column to fill on this call (defaults to the first).
#
# On each call column i is filled downwards with tidyr::fill() and the result
# is grouped by columns 1..i before recursing. From the second call onwards the
# input therefore arrives grouped by all columns to the left of i, so a value
# is only carried down within rows that share the same left-hand values.
#
# When i reaches the last column, the frame received by that call is returned,
# i.e. the last column itself is left unfilled.
mm <- function(data, i = 1) {
  cols <- names(data)
  filled <- tidyr::fill(data, !!dplyr::sym(cols[i]))
  # Symbols (instead of the deprecated `.dots =` argument) also cope with
  # column names that are not syntactic R names.
  dat <- dplyr::group_by(filled, !!!dplyr::syms(cols[seq_len(i)]))
  if (i < ncol(dat)) mm(dat, i + 1) else data
}
# Fill the hierarchy with mm(), then reshape to one row per code plus the
# first-column entries that have no code at all.
df %>%
  mm %>%
  {
    # Paste the columns of every row into one space-separated string.
    b <- do.call(paste, .)
    # Keep rows that contain a standalone number, or rows in which every
    # column except the first is NA.
    b[grepl("\\b\\d+\\b", b) | rowSums(is.na({.}[-1])) == ncol(df) - 1]
  } %>%
  # Move the first standalone number (or the first NA) to the front.
  sub("(.*?)(\\b\\d+\\b|NA)", "\\2 \\1", .) %>%
  # Re-parse the strings as a table; "character" keeps leading zeros.
  read.table(text = ., fill = TRUE, colClasses = "character") %>%
  # r flags rows in which all but one column are NA.
  mutate(r = rowSums(is.na(.)) == ncol(.) - 1) %>%
  group_by(V2) %>%
  # Keep every non-flagged row; keep a flagged row only when it is the sole
  # row for its V2 value.
  filter(n() == 1 & r | !r) %>%
  select(-r) %>%
  data.frame()
V1 V2 V3 V4 V5 V6 V7 V8 V9
1 1111 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_1 <NA> <NA> <NA>
2 2885 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_1 <NA> <NA> <NA>
3 0001 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_2 <NA> <NA> <NA>
4 0011 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_3 Sub6_1 Sub7_1 <NA>
5 9494 Main1 Sub2_1 Sub3_1 Sub4_2 Sub5_5 Sub6_2 Sub7_2 <NA>
6 8479 Main1 Sub2_1 Sub3_1 Sub4_2 Sub5_5 Sub6_2 Sub7_2 Sub8_1
7 15 Main1 Sub2_1 Sub3_2 <NA> <NA> <NA> <NA> <NA>
8 <NA> Main2 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
9 <NA> Main3 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
import pandas as pd
import numpy as np
import re
#df = pd.read_clipboard()
#df[df=="<NA>"]=np.nan
#df['L1']=df['L1'].ffill()
def mmpy(data, m, i=0):
    """Forward-fill the columns of ``data`` one at a time, left to right.

    Starting at column position ``i``, each column is replaced by the
    forward-filled version of the same-named column taken from ``m``.
    After a column is filled, ``m`` becomes a groupby over all columns up
    to and including it, so the next column is forward-filled within those
    groups.

    Args:
        data: DataFrame to fill; a deep copy is modified, not the input.
        m: Object indexable by column name whose result supports
            ``.ffill()`` (a DataFrame on the first step, a groupby after).
        i: Position of the first column to fill.

    Returns:
        A new DataFrame with columns ``i`` through the last one filled.
    """
    col = i
    while True:
        # Work on a fresh copy at every level, as each step rebinds `data`.
        data = data.copy(deep=True)
        data.iloc[:, col] = m[data.columns[col]].ffill()
        # Group by everything filled so far for the next column's fill.
        m = data.groupby(list(data.columns[0:col + 1]))
        if col >= len(data.columns) - 1:
            return data
        col += 1
# Fill the hierarchy; a copy of df serves as the fill source for the first column.
s = mmpy(df, df.copy())
# One space-separated line per row (missing values are rendered as "nan").
a = "\n".join(" ".join(str(k) for k in row) for row in s.itertuples(index=False, name=None))
# Move the first standalone number (or the first "nan") on each line to the front.
b = re.sub(r"^(.*?)(\b\d+\b|nan)", r"\2 \1", a, flags=re.M)
# Keep lines that start with a number, plus "Main" lines with no digit after
# the Main label. The class must be \D (non-digit); a doubled backslash in a
# raw string would match a literal backslash and drop every "Main"-only line.
w = pd.DataFrame([i.split() for i in re.findall(r"^\d+.*$|.*Main\S* \D*$", b, re.M)])
0 1 2 3 4 5 6 7 8
0 nan Main1 nan nan nan nan nan nan nan
1 1111 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_1 nan nan nan
2 2885 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_1 nan nan nan
3 0001 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_2 nan nan nan
4 0011 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_3 Sub6_1 Sub7_1 nan
5 9494 Main1 Sub2_1 Sub3_1 Sub4_2 Sub5_5 Sub6_2 Sub7_2 nan
6 8479 Main1 Sub2_1 Sub3_1 Sub4_2 Sub5_5 Sub6_2 Sub7_2 Sub8_1
7 15 Main1 Sub2_1 Sub3_2 nan nan nan nan nan
8 nan Main2 nan nan nan nan nan nan nan
9 nan Main3 nan nan nan nan nan nan nan
答案 1 :(得分:1)
我不确定自己是否100%正确理解了您的问题,但这似乎可以复现您想要的输出(假设df中没有其他数据,正如Onyambu的评论中所述)。
# Change format of data: flatten the data frame row by row, so that positions
# in `vec` follow the reading order of the original table.
vec <- c(t(as.matrix(df)))
subLevels <- 2:8
# Regex patterns for the 4-digit code and for each level label. The code
# pattern is anchored so that only cells consisting of exactly four digits
# count as codes.
patterns <- c("^[0-9]{4}$", "Main[0-9]{1,}",
              paste0("Sub", subLevels, "_[0-9]{1,}"))
# Find the positions in `vec` that belong to the codes and to each level.
idxList <- lapply(patterns, grep, vec)
codeIdx <- idxList[[1]]
# For every position in `vec`, the position of the most recent label of each
# level (0 while that level has not occurred yet). Building this from an
# all-zero vector stays correct when a level has no matches at all.
lastSeen <- lapply(idxList[-1], function(x) {
  pos <- integer(length(vec))
  pos[x] <- x
  cummax(pos)
})
# For each code, walk the levels from the top down. A label is used only if it
# occurred after the most recent label of every shallower level; otherwise it
# belongs to an earlier branch and the cell is left NA.
parentPos <- integer(length(codeIdx))
levelCols <- vector("list", length(lastSeen))
for (k in seq_along(lastSeen)) {
  p <- lastSeen[[k]][codeIdx]
  ok <- p > parentPos
  col <- rep(NA_character_, length(codeIdx))
  col[ok] <- vec[p[ok]]
  levelCols[[k]] <- col
  parentPos <- pmax(parentPos, p)
}
# One row per 4-digit code with the level labels that apply to it.
data.frame(code_f = vec[codeIdx],
           do.call("cbind", levelCols))
# code_f X1 X2 X3 X4 X5 X6 X7 X8
# 1 1111 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_1 <NA> <NA> <NA>
# 2 2885 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_1 <NA> <NA> <NA>
# 3 0001 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_2 <NA> <NA> <NA>
# 4 0011 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_3 Sub6_1 Sub7_1 <NA>
# 5 9494 Main1 Sub2_1 Sub3_1 Sub4_2 Sub5_5 Sub6_2 Sub7_2 <NA>
# 6 8479 Main1 Sub2_1 Sub3_1 Sub4_2 Sub5_5 Sub6_2 Sub7_2 Sub8_1