假设我有一些网络数据,如下所示:
# Collects product links from the Eastbay men's-shoes listing into `container`
# and prints the resulting list.
# Expects `requests` and `BeautifulSoup` (bs4) to be in scope already.
# NOTE(review): the loop nesting below was reconstructed from a paste that had
# lost all indentation -- confirm against the original script.
url = "https://www.eastbay.com/category/mens/shoes.html?query=%3Arelevance%3Agender%3A200000%3AproductType%3A200005%3Abrand%3AChampion%3Abrand%3AConverse%3Abrand%3AFila%3Abrand%3AJordan%3Abrand%3ANew+Balance%3Abrand%3ANike%3Abrand%3ANike+SB%3Abrand%3APUMA%3Abrand%3AReebok%3Abrand%3ASalomon%3Abrand%3AThe+North+Face%3Abrand%3ATimberland%3Abrand%3AUGG%3Abrand%3AUnder+Armour%3Abrand%3AVans%3Abrand%3Aadidas%3Abrand%3Aadidas+Originals"
# Query-string parameters sent with every request; 'currentPage' is
# overwritten on each loop iteration.
qsp = {
    'currentPage': 1,
    'pageSize': 100,
    'timestamp': 3
}
container = []
# NOTE(review): 'currentPage' receives 0, 60, 120, ... while 'pageSize' is 100;
# this looks like an offset rather than a page index -- confirm it is intended.
for page_content in range(0,1500,60):
    qsp['currentPage'] = page_content
    res = requests.get(url,params=qsp,headers={"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"})
    soup = BeautifulSoup(res.text, 'lxml')
    # Product-card links present directly in the returned markup.
    for item in soup.select(".c-product-card a"):
        container.append("https://www.eastbay.com"+item['href'])
    # Each <script> body is re-parsed as HTML and searched for the same
    # product-card links (presumably cards embedded in script text).
    for items in soup.select("script"):
        sauce = BeautifulSoup(items.text,"lxml")
        for elem in sauce.select(".c-product-card a"):
            container.append("https://www.eastbay.com"+elem['href'])
print(container)
这可以看作一个网络,其中 val 是两个节点之间边的权重。不过,我想把 A 到 B 与 B 到 A 的权重相加,得到如下结果:
# Example edge list: each row is an edge from `col_a` to `col_b` with weight
# `val`. The three vectors stay as globals as well as columns of `df`.
col_a <- c("A", "B", "C")
col_b <- c("B", "A", "A")
val <- c(1, 3, 7)
df <- data.frame(
  col_a = col_a,
  col_b = col_b,
  val = val
)
df
col_a col_b val
1 A B 1
2 B A 3
3 C A 7
# Desired result: one row per undirected pair, with the weights of both
# directions added together (A-B: 1 + 3 = 4, A-C: 7).
new_col_a <- c("A", "A")
new_col_b <- c("B", "C")
new_val <- c(4, 7)
want_df <- data.frame(new_col_a, new_col_b, new_val)
want_df
new_col_a new_col_b new_val
1 A B 4
2 A C 7
在 R 中有办法做到这一点吗?
答案 0 :(得分:3)
一种使用 dplyr 的可能做法是:
# Sum `val` over undirected pairs. The key `grp` joins the smaller and the
# larger endpoint label of each row with "_", so A-B and B-A share a key.
df %>%
  mutate_if(is.factor, as.character) %>%
  mutate(
    grp = paste(pmin(col_a, col_b), pmax(col_a, col_b), sep = "_")
  ) %>%
  group_by(grp) %>%
  summarise(val = sum(val))
grp val
<chr> <dbl>
1 A_B 4
2 A_C 7
或者使用 tidyverse,思路与 @Sonny 的类似:
# Alternative: build the pair key by sorting the two endpoint labels of each
# row, then sum `val` per key.
df %>%
# Turn factor columns into character before they are sorted and pasted.
mutate_if(is.factor, as.character) %>%
# Packs col_a and col_b into a list-column, referenced below as `data`.
# NOTE(review): this is the older tidyr call form; newer releases expect
# nest(data = c(col_a, col_b)) -- confirm against the installed version.
nest(col_a, col_b) %>%
# For every nested element, sort its values and join them with "_" to form
# the grouping key `grp`.
group_by(grp = unlist(map(data, function(x) paste(sort(x), collapse = "_")))) %>%
# Total weight per key.
summarise(val = sum(val))
如果您还希望将其拆分为两列(此步骤还需要 tidyr):
# Same aggregation as the pmin/pmax approach, followed by splitting the "_"
# key back into the two endpoint columns (needs tidyr for `separate`).
df %>%
  mutate_if(is.factor, as.character) %>%
  mutate(
    grp = paste(pmin(col_a, col_b), pmax(col_a, col_b), sep = "_")
  ) %>%
  group_by(grp) %>%
  summarise(val = sum(val)) %>%
  separate(grp, into = c("new_col_a", "new_col_b"), sep = "_")
new_col_a new_col_b val
<chr> <chr> <dbl>
1 A B 4
2 A C 7
或者,对于第二种做法:
# Nest-based variant with the key split back into two columns at the end.
df %>%
# Turn factor columns into character before they are sorted and pasted.
mutate_if(is.factor, as.character) %>%
# Packs col_a and col_b into a list-column, referenced below as `data`.
# NOTE(review): this is the older tidyr call form; newer releases expect
# nest(data = c(col_a, col_b)) -- confirm against the installed version.
nest(col_a, col_b) %>%
# For every nested element, sort its values and join them with "_" to form
# the grouping key `grp`.
group_by(grp = unlist(map(data, function(x) paste(sort(x), collapse = "_")))) %>%
# Total weight per key.
summarise(val = sum(val)) %>%
# Split the "_"-joined key into the two endpoint columns.
separate(grp, c("new_col_a", "new_col_b"), sep = "_")
答案 1 :(得分:2)
您可以为此使用dplyr
# Sum edge weights over undirected pairs using a per-row sorted key.
# Uses the `col_a`, `col_b` and `val` vectors defined earlier.
library(dplyr)
library(tidyr)

# Keep the endpoint columns as character. `FALSE` is spelled out because `F`
# is an ordinary variable that can be reassigned.
df <- data.frame(col_a, col_b, val, stringsAsFactors = FALSE)

df %>%
  mutate(
    # Row by row: sort the two endpoints and join them with "_", so that
    # A->B and B->A produce the same key.
    pair = purrr::pmap_chr(
      .l = list(from = col_a, to = col_b),
      .f = function(from, to) paste(sort(c(from, to)), collapse = "_")
    )
  ) %>%
  group_by(pair) %>%
  summarise(new_val = sum(val)) %>%
  # Split the key back into its two endpoint columns.
  separate(pair, c("new_col_a", "new_col_b"), sep = "_")
# A tibble: 2 x 3
new_col_a new_col_b new_val
<chr> <chr> <dbl>
1 A B 4
2 A C 7
类似于我以前的回答
答案 2 :(得分:0)
如果先把数据整理成整洁(tidy)的长格式,问题就相当简单了:转换为长格式,对列标签排序,使其不依赖于原先所在的列,再还原为宽格式,然后按两列分组并对 val 求和:
# Sum `val` over undirected pairs via a long-format round trip.
# Labels are sorted within each original row, so every edge keeps its own two
# endpoints. A global sort over all rows would mix endpoints of different
# edges. The explicit `row_id` also keeps rows distinct for `spread()` when
# two edges share the same `val`.
df %>%
  mutate(row_id = row_number()) %>%
  gather(grp, col, -val, -row_id) %>%
  group_by(row_id) %>%
  # Within one edge: the smaller label goes to "col_a", the larger to "col_b".
  mutate(
    col = sort(col, na.last = TRUE),
    grp = sort(grp)
  ) %>%
  ungroup() %>%
  spread(grp, col) %>%
  group_by(col_a, col_b) %>%
  summarize(val = sum(val))
## A tibble: 2 x 3
## Groups: col_a [?]
# col_a col_b val
# <chr> <chr> <dbl>
#1 A B 4
#2 A C 7