使用R或Python整理数据的分层数据列表

时间:2018-07-20 21:41:40

标签: python r hierarchy hierarchical-data

我有一些数据,这些数据是以我要为客户准备的平面文件格式提供的。它是分层数据,但并没有填写所有数据,而且由于涉及许多不同的子级别,因此您不能只是简单地进行填写。数字始终是4位数字,表示特定的数字。

这是一个报告,可以发布到数十个具有数千行数据的子组。

这是R中的一个示例:

L1 <- c("Main1", rep(NA, 21), "Main2", "Main3")
L2 <- c(NA, "Sub2_1", rep(NA, 22))
L3 <- c(NA, NA, "Sub3_1", rep(NA, 17), "Sub3_2", rep(NA, 3))
L4 <- c(rep(NA, 3), "Sub4_1", rep(NA, 9), "Sub4_2", rep(NA, 7), "0015", rep(NA, 2))
L5 <- c(rep(NA, 4), "Sub5_1", NA, NA, "Sub5_2", NA, "Sub5_3", rep(NA, 4), "Sub5_5", rep(NA, 9))
L6 <- c(rep(NA, 5), "1111", "2885", NA, "0001", NA, "Sub6_1", rep(NA, 4), "Sub6_2", rep(NA, 8))
L7 <- c(rep(NA, 11), "Sub7_1", rep(NA, 4), "Sub7_2", rep(NA, 7))
L8 <- c(rep(NA, 12), "0011", rep(NA, 4), "9494", "Sub8_1", rep(NA, 5))
L9 <- c(rep(NA, 19), "8479", rep(NA, 4))

df <- data.frame(L1, L2, L3, L4, L5, L6, L7, L8, L9)

我想要这样的输出,因为我们真正需要查找的是四位数的“代码”:

code_f <- c("1111", "2885", "0001", "0011", "9494", "8479", "0015", NA, NA)
L1_f <- c(rep("Main1", 7), "Main2", "Main3")
L2_f <- c(rep("Sub2_1", 7), NA, NA)
L3_f <- c(rep("Sub3_1", 6), "Sub3_2", NA, NA)
L4_f <- c(rep("Sub4_1", 4), rep("Sub4_2", 2), rep(NA, 3))
L5_f <- c(rep("Sub5_1", 2), "Sub5_2", "Sub5_3", rep("Sub5_5", 2), rep(NA, 3))
L6_f <- c(rep(NA, 3), "Sub6_1", rep("Sub6_3", 2), rep(NA, 3))          
L7_f <- c(rep(NA, 3), "Sub7_1", rep("Sub7_2", 2), rep(NA, 3))
L8_f <- c(rep(NA, 5), "Sub8_1", rep(NA, 3))

df_f <- data.frame(code_f, L1_f, L2_f, L3_f, L4_f, L5_f, L6_f, L7_f, L8_f)

2 个答案:

答案 0 :(得分:3)

我在您的数据中看不到0015,因此无法确定其来源。但是根据您提供的信息,我们可以做到:

mm = function(data,i=1){
  dat = tidyr::fill(data,!!!names(data)[i])%>%
    group_by(.dots=names(data)[1:i])
  if(i<ncol(dat)) mm(dat,i+1) else data
}

 df%>%
  mm%>%
  {b =lift(paste)(.); b[grepl("\\b\\d+\\b",b)| rowSums(is.na({.}[-1]))==ncol(df)-1]}%>%
  sub("(.*?)(\\b\\d+\\b|NA)","\\2 \\1",.)%>%
  read.table(text=.,fill=T,colClasses = "character")%>%
  mutate(r=rowSums(is.na(.))==ncol(.)-1)%>%
  group_by(V2)%>%
  filter(n()==1 & r|!r)%>%
  select(-r)%>%data.frame()
    V1    V2     V3     V4     V5     V6     V7     V8     V9
1 1111 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_1   <NA>   <NA>   <NA>
2 2885 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_1   <NA>   <NA>   <NA>
3 0001 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_2   <NA>   <NA>   <NA>
4 0011 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_3 Sub6_1 Sub7_1   <NA>
5 9494 Main1 Sub2_1 Sub3_1 Sub4_2 Sub5_5 Sub6_2 Sub7_2   <NA>
6 8479 Main1 Sub2_1 Sub3_1 Sub4_2 Sub5_5 Sub6_2 Sub7_2 Sub8_1
7   15 Main1 Sub2_1 Sub3_2   <NA>   <NA>   <NA>   <NA>   <NA>
8 <NA> Main2   <NA>   <NA>   <NA>   <NA>   <NA>   <NA>   <NA>
9 <NA> Main3   <NA>   <NA>   <NA>   <NA>   <NA>   <NA>   <NA>

python版本将是:

import pandas as pd
import numpy as np
import re

#df = pd.read_clipboard()
 #df[df=="<NA>"]=np.nan
#df['L1']=df['L1'].ffill()
def mmpy(data,m,i=0):
    data = data.copy(deep=True)
    data.iloc[:,i] = m[data.columns[i]].ffill()
    m = data.groupby(list(data.columns[0:i+1]))
    if i < len(data.columns)-1:  return mmpy(data,m,i+1)
    return data


s = mmpy(df,df.copy())
a = "\n".join([" ".join([str(k) for k in i.values()]) for i in s.T.to_dict().values()])
b = re.sub(r"^(.*?)(\b\d+\b|nan)",r"\2 \1",a,flags=re.M)

w = pd.DataFrame([i.split() for i in re.findall(r"^\d+.*$|.*Main\S* \\D*$",b,re.M)])

      0      1       2       3       4       5       6       7       8
0   nan  Main1     nan     nan     nan     nan     nan     nan     nan
1  1111  Main1  Sub2_1  Sub3_1  Sub4_1  Sub5_1     nan     nan     nan
2  2885  Main1  Sub2_1  Sub3_1  Sub4_1  Sub5_1     nan     nan     nan
3  0001  Main1  Sub2_1  Sub3_1  Sub4_1  Sub5_2     nan     nan     nan
4  0011  Main1  Sub2_1  Sub3_1  Sub4_1  Sub5_3  Sub6_1  Sub7_1     nan
5  9494  Main1  Sub2_1  Sub3_1  Sub4_2  Sub5_5  Sub6_2  Sub7_2     nan
6  8479  Main1  Sub2_1  Sub3_1  Sub4_2  Sub5_5  Sub6_2  Sub7_2  Sub8_1
7    15  Main1  Sub2_1  Sub3_2     nan     nan     nan     nan     nan
8   nan  Main2     nan     nan     nan     nan     nan     nan     nan
9   nan  Main3     nan     nan     nan     nan     nan     nan     nan

答案 1 :(得分:1)

不确定我的问题是否100%正确,但这似乎可以复制您想要的输出(假设df中没有其他数据;如Onyambu的评论中所述)。

#change format of data
vec=c(t(as.matrix(df)))
subLevels=2:8
#regex patterns for 4-digit number and levels
patterns=c("[0-9]{4,4}","Main[0-9]{1,}",paste0("Sub",2:maxSub,"_[0-9]{1,}"))
#find indices for each level
idxList=lapply(patterns,grep,vec)
#replace all data that does not correspond to a given level by NA
valList=lapply(idxList,function(x) {tmp=vec;tmp[-x]=NA;tmp})

#the zoo library has a function to move missing values forward -> na.locf
library(zoo)
#for each 4-digit number and each level, find the respective level-string 
data.frame(code_f=na.omit(valList[[1]]),
           do.call("cbind",
                   lapply(valList[-1],
                          function(x) na.locf(x,na.rm=FALSE)[idxList[[1]]])))

#   code_f    X1     X2     X3     X4     X5     X6     X7     X8
# 1   1111 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_1   <NA>   <NA>   <NA>
# 2   2885 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_1   <NA>   <NA>   <NA>
# 3   0001 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_2   <NA>   <NA>   <NA>
# 4   0011 Main1 Sub2_1 Sub3_1 Sub4_1 Sub5_3 Sub6_1 Sub7_1   <NA>
# 5   9494 Main1 Sub2_1 Sub3_1 Sub4_2 Sub5_5 Sub6_2 Sub7_2   <NA>
# 6   8479 Main1 Sub2_1 Sub3_1 Sub4_2 Sub5_5 Sub6_2 Sub7_2 Sub8_1