根据dplyr中每列中的数据合并数据帧

时间:2019-04-17 18:12:10

标签: r dplyr

假设我有一些网络数据,如下所示:

# Listing URL that is requested repeatedly; the query string pre-selects a set of brands.
url = "https://www.eastbay.com/category/mens/shoes.html?query=%3Arelevance%3Agender%3A200000%3AproductType%3A200005%3Abrand%3AChampion%3Abrand%3AConverse%3Abrand%3AFila%3Abrand%3AJordan%3Abrand%3ANew+Balance%3Abrand%3ANike%3Abrand%3ANike+SB%3Abrand%3APUMA%3Abrand%3AReebok%3Abrand%3ASalomon%3Abrand%3AThe+North+Face%3Abrand%3ATimberland%3Abrand%3AUGG%3Abrand%3AUnder+Armour%3Abrand%3AVans%3Abrand%3Aadidas%3Abrand%3Aadidas+Originals"

# Extra query-string parameters sent with every request; 'currentPage' is
# overwritten on each loop iteration below.
qsp = dict(currentPage=1, pageSize=100, timestamp=3)

# Accumulates the absolute product links found on every fetched page.
container = []

# NOTE(review): 'currentPage' is set to 0, 60, 120, ... while 'pageSize' is 100;
# whether the site expects an offset or a page index here is not visible -- confirm.
for page_content in range(0, 1500, 60):
    qsp['currentPage'] = page_content
    res = requests.get(
        url,
        params=qsp,
        headers={"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"},
    )
    soup = BeautifulSoup(res.text, 'lxml')

    # Product-card anchors present directly in the fetched markup.
    container.extend(
        "https://www.eastbay.com" + anchor['href']
        for anchor in soup.select(".c-product-card a")
    )

    # Each <script> body is re-parsed as HTML and searched for the same anchors.
    for script_tag in soup.select("script"):
        sauce = BeautifulSoup(script_tag.text, "lxml")
        container.extend(
            "https://www.eastbay.com" + anchor['href']
            for anchor in sauce.select(".c-product-card a")
        )

print(container)

这可以看作一个网络,而val是两个节点之间边的权重。但是,我想把A到B和B到A这两条边的权重相加,以得到以下结果:

# Example edge list: row i links col_a[i] and col_b[i] with weight val[i].
col_a <- c("A", "B", "C")
col_b <- c("B", "A", "A")
val <- c(1, 3, 7)

# Column names are given explicitly; they match the vector names.
df <- data.frame(col_a = col_a, col_b = col_b, val = val)
print(df)

  col_a col_b val
1     A     B   1
2     B     A   3
3     C     A   7

new_col_a <- c("A", "A")
new_col_b <- c("B", "C")
new_val <- c(4,7)
want_df <- data.frame(new_col_a, new_col_b, new_val)
want_df

  new_col_a new_col_b new_val
1         A         B       4
2         A         C       7

dplyr中有办法做到这一点吗?

3 个答案:

答案 0 :(得分:3)

一种使用dplyr的可能做法是:

# Build an order-independent key per row (smaller label first, larger second),
# then total the weights for each key.
df %>%
  mutate_if(is.factor, as.character) %>%
  mutate(grp = paste(pmin(col_a, col_b), pmax(col_a, col_b), sep = "_")) %>%
  group_by(grp) %>%
  summarise(val = sum(val))

  grp     val
  <chr> <dbl>
1 A_B       4
2 A_C       7

或者使用tidyverse,使用与@Sonny类似的想法:

# Nest the two label columns into a per-row list-column, build an
# order-independent key from the sorted labels, then total the weights per key.
df %>%
  mutate_if(is.factor, as.character) %>%
  # NOTE(review): newer tidyr releases appear to prefer the named form
  # nest(data = c(col_a, col_b)); the positional form is kept here -- confirm
  # against the installed tidyr version.
  nest(col_a, col_b) %>%
  group_by(
    # Each element of `data` is a one-row data frame. Flatten it to a plain
    # character vector before sorting: calling sort() on the data frame itself
    # relies on xtfrm() of a data frame, which newer R versions reject.
    grp = map_chr(data, function(x) paste(sort(unlist(x)), collapse = "_"))
  ) %>%
  summarise(val = sum(val))

如果您还希望将其分为两列(此步骤还需要tidyr):

# Same pairing key as before (smaller label first), summed per key, and then
# split back into two label columns.
df %>%
  mutate_if(is.factor, as.character) %>%
  mutate(grp = paste(pmin(col_a, col_b), pmax(col_a, col_b), sep = "_")) %>%
  group_by(grp) %>%
  summarise(val = sum(val)) %>%
  # The key is split on "_", so this step assumes labels contain no "_".
  separate(grp, c("new_col_a", "new_col_b"), sep = "_")

  new_col_a new_col_b   val
  <chr>     <chr>     <dbl>
1 A         B             4
2 A         C             7

或者在第二种可能性的情况下:

# Nest the two label columns per row, build an order-independent key from the
# sorted labels, total the weights per key, then split the key into two columns.
df %>%
  mutate_if(is.factor, as.character) %>%
  # NOTE(review): newer tidyr releases appear to prefer the named form
  # nest(data = c(col_a, col_b)); the positional form is kept here -- confirm
  # against the installed tidyr version.
  nest(col_a, col_b) %>%
  group_by(
    # Flatten the one-row data frame to a character vector before sorting;
    # sort() on a data frame relies on xtfrm(), which newer R versions reject.
    grp = map_chr(data, function(x) paste(sort(unlist(x)), collapse = "_"))
  ) %>%
  summarise(val = sum(val)) %>%
  # The key is split on "_", so this step assumes labels contain no "_".
  separate(grp, c("new_col_a", "new_col_b"), sep = "_")

答案 1 :(得分:2)

您可以为此使用dplyr

# Keep the labels as character vectors (not factors) so they can be sorted and
# pasted directly. FALSE is spelled out because `F` is an ordinary variable
# that can be reassigned.
df <- data.frame(col_a, col_b, val, stringsAsFactors = FALSE)

library(dplyr)
library(tidyr)
df %>%
  mutate(
    # Order-independent key for each row: the two labels sorted and joined.
    pair = purrr::pmap_chr(
      .l = list(from = col_a, to = col_b),
      .f = function(from, to) paste(sort(c(from, to)), collapse = "_")
    )
  ) %>%
  group_by(pair) %>%
  summarise(new_val = sum(val)) %>%
  # The key is split on "_", so this step assumes labels contain no "_".
  separate(pair, c("new_col_a", "new_col_b"), sep = "_")
  # A tibble: 2 x 3
  new_col_a new_col_b new_val
  <chr>     <chr>       <dbl>
1 A         B               4
2 A         C               7

类似于我以前的answers

答案 2 :(得分:0)

如果首先将数据整理成整洁的长格式,问题就会变得相当简单。先转换为长格式,对节点标签排序,使其与原来所在的列无关,再还原为宽格式,最后按col_a、col_b分组并对val求和:

# Reshape to long form, sort the two labels of each edge so the smaller one
# always lands in col_a, reshape back to wide form and total the weights per
# unordered pair.
df %>%
    # A per-row id keeps each edge's two labels together. Without it the sort
    # below would mix labels from different edges, and spread() would fail
    # whenever two edges share the same val.
    mutate(row_id = row_number()) %>%
    gather(grp, col, -val, -row_id) %>%
    # Within each edge, put the rows in col_a, col_b order and assign the
    # sorted labels in that order.
    arrange(row_id, grp) %>%
    group_by(row_id) %>%
    mutate(col = sort(col)) %>%
    ungroup() %>%
    spread(grp, col) %>%
    group_by(col_a, col_b) %>%
    summarize(val = sum(val))

## A tibble: 2 x 3
## Groups:   col_a [?]
#  col_a col_b   val
#  <chr> <chr> <dbl>
#1 A     B         4
#2 A     C         7