Question

我有下面这个数据框，其中包含名称（names）和城市（cities）两列，我想对它们进行处理。我不知道该如何用文字描述这个问题，所以在下面列出了输入表和期望的输出表。

输入：

+---------+-----------+------------+---------------+
| Varname | Component |   names    |    cities     |
+---------+-----------+------------+---------------+
| A       | B         | Jack,Bruce | New york      |
| B       |           | Cathy      | Boston,Miami  |
| C       |           | Bob        | New york      |
| D       | C         | Dick,Nancy | Austin,Dallas |
| E       | A,C       |            |               |
| F       |           | Mandy      | Manchester    |
+---------+-----------+------------+---------------+

输出：

+---------+-----------+----------------------+------------------------+
| Varname | Component |        names         |         cities         |
+---------+-----------+----------------------+------------------------+
| A       |           | Jack,Bruce,Cathy     | New york,Boston,Miami  |
| B       |           | Cathy                | Boston,Miami           |
| C       |           | Bob                  | New york               |
| D       |           | Dick,Nancy,Bob       | Austin,Dallas,New york |
| E       |           | Jack,Bruce,Cathy,Bob | New york,Boston,Miami  |
| F       |           | Mandy                | Manchester             |
+---------+-----------+----------------------+------------------------+

如您所见，我想读取 Component 列，对该列中列出的每个 Varname 查找其对应的 names 和 cities（实际上我还有更多的列），并把它们合并到当前行，从而得到一张完整的表格。这可能吗？我不知道该从哪里入手。我的表格不大，因此使用 for() {} 循环也是可以接受的。

-> 编辑：我之前给出的例子可能不太恰当，所以我把输入替换成了与我的实际数据更一致的内容。

输入的 dput() 结果：

structure(list(Varname = structure(1:6, .Label = c("A", "B", "C", "D", "E", "F"), class = "factor"), Component = structure(c(3L, 1L, 1L, 4L, 2L, 1L), .Label = c("", "A,C", "B", "C"), class = "factor"), names = structure(c(5L, 3L, 2L, 4L, 1L, 6L), .Label = c("", "Bob", "Cathy", "Dick,Nancy", "Jack,Bruce", "Mandy"), class = "factor"), cities = structure(c(5L, 3L, 5L, 2L, 1L, 4L), .Label = c("", "Austin,Dallas", "Boston,Miami", "Manchester", "New york"), class = "factor")), .Names = c("Varname", "Component", "names", "cities"), class = "data.frame", row.names = c(NA, -6L))

Answer 1

不是最吸引人的R代码（绝对不是最有效的），但它完成了工作。希望其他人可以改进它。

## Demo input: one row per Varname. A non-empty Component names another
## Varname that this row is linked to.
starting_df <- read.table(
  text = "Varname|Component|names|cities
A||Jack,Bruce|New york
B||Cathy|Boston,Miami
C|A|Bob|New york
D|C|Dick,Nancy|Austin,Dallas",
  header = TRUE, sep = "|", stringsAsFactors = FALSE
)

## Union two comma-separated lists into one comma-separated string.
## Items keep first-seen order, duplicates are dropped, and empty items are
## ignored so that an empty side never produces a stray leading comma.
merge_csv <- function(first, second) {
  tokens <- unlist(strsplit(c(first, second), split = ",", fixed = TRUE))
  tokens <- unique(tokens[nzchar(tokens)])
  paste(tokens, collapse = ",")
}

## Grab all the rows whose Component values are in the Varname column and
## vice-versa (i.e. every row that takes part in a link)
is_linked <- starting_df$Varname %in% starting_df$Component |
  starting_df$Component %in% starting_df$Varname
intermediate_df <- starting_df[is_linked, ]

## Accumulate names and cities down the linked rows: every row after the
## first is merged with the (already merged) row directly above it.
## NOTE: this relies on the linked rows being ordered as one dependency
## chain, as in the demo data; the Component value itself is not looked up.
## seq_len(...)[-1] is empty for zero or one linked rows, so the loop is
## skipped safely in those cases.
for (x in seq_len(nrow(intermediate_df))[-1]) {
  for (col in c("names", "cities")) {
    intermediate_df[x, col] <- merge_csv(
      intermediate_df[x - 1, col],
      intermediate_df[x, col]
    )
  }
}

## Binding the new dataset with the starting dataset (only the Varnames
## that are not already in the new dataset)
untouched <- !(starting_df$Varname %in% intermediate_df$Varname)
final_df <- rbind(intermediate_df, starting_df[untouched, ])

## Order by the Varname column to get the desired output
final_df <- final_df[order(final_df$Varname), ]

您想要的输出：

 Varname Component names                     cities                
 A                 Jack,Bruce                New york              
 B                 Cathy                     Boston,Miami          
 C       A         Jack,Bruce,Bob            New york              
 D       C         Jack,Bruce,Bob,Dick,Nancy New york,Austin,Dallas

编辑新数据集：

这个版本使用了 for 循环（我在 R 中其实并不喜欢这样做），但看起来能够得到想要的结果：

##Setting up the new dataset
##Setting up the new dataset (dput() output from the question)
starting_df1 <- structure(
  list(
    Varname = structure(
      1:6,
      .Label = c("A", "B", "C", "D", "E", "F"),
      class = "factor"
    ),
    Component = structure(
      c(3L, 1L, 1L, 4L, 2L, 1L),
      .Label = c("", "A,C", "B", "C"),
      class = "factor"
    ),
    names = structure(
      c(5L, 3L, 2L, 4L, 1L, 6L),
      .Label = c("", "Bob", "Cathy", "Dick,Nancy", "Jack,Bruce", "Mandy"),
      class = "factor"
    ),
    cities = structure(
      c(5L, 3L, 5L, 2L, 1L, 4L),
      .Label = c("", "Austin,Dallas", "Boston,Miami", "Manchester",
                 "New york"),
      class = "factor"
    )
  ),
  .Names = c("Varname", "Component", "names", "cities"),
  class = "data.frame",
  row.names = c(NA, -6L)
)

##Change the fields from factor variables to characters (so that you can
##use them for concatenating). lapply() converts column by column instead
##of coercing the whole data frame through a matrix as apply() would.
starting_df1[] <- lapply(starting_df1, as.character)

## Union two comma-separated lists into one comma-separated string.
## Items keep first-seen order, duplicates are dropped, and empty items are
## ignored so that an empty side never produces a stray leading comma.
merge_csv <- function(first, second) {
  tokens <- unlist(strsplit(c(first, second), split = ",", fixed = TRUE))
  tokens <- unique(tokens[nzchar(tokens)])
  paste(tokens, collapse = ",")
}

## Columns whose comma-separated contents are pulled in from the referenced
## rows; add further column names here if the table has more of them.
value_cols <- c("names", "cities")

##For every row that has a value in the Component column, find the rows
##whose Varname is listed there and merge their values into this row.
##Varnames are matched exactly with %in% (not as a regular expression), so
##"A" can never match a longer name such as "AB".
##Rows are updated in place from top to bottom, so a row that references an
##earlier, already-merged row also inherits what that row pulled in
##(E gets Cathy through A in the demo data).
for (i in which(nzchar(starting_df1$Component))) {
  wanted <- strsplit(starting_df1$Component[i], split = ",", fixed = TRUE)
  wanted <- trimws(wanted[[1]])
  holder <- which(starting_df1$Varname %in% wanted)
  for (j in holder) {
    for (col in value_cols) {
      starting_df1[i, col] <- merge_csv(
        starting_df1[i, col],
        starting_df1[j, col]
      )
    }
  }
}

print(starting_df1, row.names = FALSE, right = FALSE)

期望的输出：

 Varname Component names                cities                
 A       B         Jack,Bruce,Cathy     New york,Boston,Miami 
 B                 Cathy                Boston,Miami          
 C                 Bob                  New york              
 D       C         Dick,Nancy,Bob       Austin,Dallas,New york
 E       A,C       Jack,Bruce,Cathy,Bob New york,Boston,Miami 
 F                 Mandy                Manchester

R数据帧基于列

1 个答案:

编辑新数据集：

期望的输出：