Question

在把这个问题的逻辑与 dplyr 的思路对应起来时，我遇到了一些麻烦。通常，如果要把一个组归约为单个数值，就使用 summarise；如果要为每一行计算一个单独的数值，就使用 mutate。但是，如果要针对每一行、在它所属的整个组上做计算，该怎么办？

在下面的示例中，mloc 包含一个指向 pnum 的指针，目标是添加一个新列 nm_child：对每一行，统计同一组内有多少个 mloc 值指向该行的 pnum 索引（即与该行的 pnum 值相同）。如果使用嵌套循环，或者使用 map（前提是我知道如何 1）遍历每个组，2）遍历每个元素，并且 3）把 map 的输出作为该组中的一列返回），这很容易做到。

library(tidyverse)
ser <- c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2)
pnum <- c(1:5, 1:6)
mloc <- c(0, 2, 2, 0, 3, 1, 1, 0, 0, 3, 4)
tb1 <- tibble(ser,pnum, mloc)
tb2 <- tb1 %>%
  group_by(ser) %>%
  mutate(nm_child = sum(pnum == mloc))

上面的 nm_child 总是 = 1。我明白了为什么它不起作用，但我看不出为什么它能做到这一点。

我也尝试过

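mutate(nm_child = count(pnum == mloc))

（返回

no applicable method for 'groups' applied to an object of class "logical")

和其他各种东西。我确实通过添加几列中间值并使用一堆嵌套的 ifelse() 做出了一个可行的版本，但是在我的 900 万行数据上运行需要 20 多分钟——相比之下，回归和大多数简单的 dplyr 操作的耗时，从几秒钟到快得察觉不到不等。

所需的输出：

#      ser  pnum  mloc nm_child
#  1     1     1     0        0
#  2     1     2     2        2
#  3     1     3     2        1
#  4     1     4     0        0
#  5     1     5     3        0
#  6     2     1     1        2
#  7     2     2     1        0
#  8     2     3     0        1
#  9     2     4     0        1
# 10     2     5     3        0
# 11     2     6     4        0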

Answer 1

这是使用 sapply 的一种方法-

<!-- Maven POM fragment for a web module whose start class is com.study.TestStarter
     and whose parent is com.study:test_component.
     NOTE(review): this is Java build configuration; nothing in the R discussion
     around it refers to it, and the closing </project> tag is absent. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>


    <!-- Parent POM this module inherits from. -->
    <parent>
        <groupId>com.study</groupId>
        <artifactId>test_component</artifactId>
        <version>1.0-SNAPSHOT</version>
    </parent>


    <!-- Module-level properties. NOTE(review): none of them is referenced again in
         this fragment; presumably the parent POM or plugin defaults consume them - confirm. -->
    <properties>
        <start-class>com.study.TestStarter</start-class>
        <java.version>1.8</java.version>
        <source>1.8</source>
        <target>1.8</target>
        <assembly_build_version>2.3.523</assembly_build_version>
        <!-- NOTE(review): ${svc_install_pkg} is not defined among the properties
             visible here; confirm it is supplied by the parent POM or the command line. -->
        <descriptorRef>${svc_install_pkg}</descriptorRef>
        <!-- Additionally, Please make sure that your JAVA_HOME is pointing to 
            1.8 when building on commandline -->
        <skip_tomcat_bundle>false</skip_tomcat_bundle>

        <tomcat_bundle>tomcat85_24.tgz</tomcat_bundle>
    </properties>

    <dependencies>
        <!-- Add typical dependencies for a web application -->
        <!-- Adds Tomcat and Spring MVC, along others -->
        <!--dependency> <groupId>org.springframework.boot</groupId> <artifactId>spring-boot-starter-parent</artifactId> 
            <version>1.4.3.RELEASE</version> </dependency -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
            <version>1.4.3.RELEASE</version>
            <!-- <exclusions> <exclusion> <groupId>org.springframework.boot</groupId> 
                <artifactId>spring-boot-starter-tomcat</artifactId> </exclusion> </exclusions> -->
        </dependency>
        <!-- The Tomcat starter and embedded EL below use scope "provided". -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-tomcat</artifactId>
            <version>1.4.3.RELEASE</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.tomcat.embed</groupId>
            <artifactId>tomcat-embed-el</artifactId>
            <version>8.5.6</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>javax.servlet</groupId>
            <artifactId>jstl</artifactId>
            <version>1.2</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.dataformat</groupId>
            <artifactId>jackson-dataformat-xml</artifactId>
            <version>2.9.8</version>
        </dependency>
        <dependency>
            <groupId>it.unimi.dsi</groupId>
            <artifactId>fastutil</artifactId>
            <version>7.0.11</version>
        </dependency>
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>19.0</version>
        </dependency>
        <!-- Everything from here to the end of the list is test-scoped. -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>
        <!-- NOTE(review): spring-test is pinned to 5.1.0.RELEASE while the Spring Boot
             artifacts in this list are 1.4.x; confirm these versions are compatible. -->
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-test</artifactId>
            <version>5.1.0.RELEASE</version>
            <scope>test</scope>
        </dependency>
        <!-- NOTE(review): 1.4.4.RELEASE here versus 1.4.3.RELEASE for the other
             Spring Boot artifacts; confirm the mismatch is intended. -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-test-autoconfigure</artifactId>
            <version>1.4.4.RELEASE</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.mockito</groupId>
            <artifactId>mockito-all</artifactId>
            <version>1.10.19</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.5</version>
            <scope>test</scope>
        </dependency>
    </dependencies>
    <!-- NOTE(review): several plugins below (spring-boot-maven-plugin, maven-jar-plugin,
         maven-assembly-plugin, maven-deploy-plugin) declare no version; presumably the
         parent POM manages them - confirm. -->
    <build>
        <plugins>
            <plugin><!-- Include if you want to make an executable jar[FAT JAR which 
                    includes all dependencies along with Spring Boot loader] that you can run on 
                    commandline using java -jar NAME -->
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <version>2.12.4</version>
            </plugin>
            <!-- JaCoCo: binds the prepare-agent goal. -->
            <plugin>
                <groupId>org.jacoco</groupId>
                <artifactId>jacoco-maven-plugin</artifactId>
                <version>0.7.7.201606060606</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>prepare-agent</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <!-- WAR plugin configured with failOnMissingWebXml=false. -->
            <plugin>
              <groupId>org.apache.maven.plugins</groupId>
              <artifactId>maven-war-plugin</artifactId>
              <version>2.2</version>
              <configuration>
                <failOnMissingWebXml>false</failOnMissingWebXml>
              </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-jar-plugin</artifactId>
            </plugin>
            <!-- Attaches target/hgvs-${project.version}.jar as an extra jar artifact in the
                 package phase. NOTE(review): the "hgvs" file-name prefix is hard-coded and
                 this module's own artifactId is not visible here; confirm they match. -->
            <plugin>
        <groupId>org.codehaus.mojo</groupId>
        <artifactId>build-helper-maven-plugin</artifactId>
        <version>3.0.0</version>
        <executions>
          <execution>
            <id>attach-artifacts</id>
            <phase>package</phase>
            <goals>
              <goal>attach-artifact</goal>
            </goals>
            <configuration>
              <artifacts>
                <artifact>
                  <file>target/hgvs-${project.version}.jar</file>
                  <type>jar</type>
                </artifact>
              </artifacts>
            </configuration>
          </execution>
        </executions>
      </plugin>
      <!-- Source plugin: runs jar-no-fork in the verify phase. -->
      <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-source-plugin</artifactId>
            <version>3.0.1</version>
            <executions>
              <execution>
                <id>attach-sources</id>
                <phase>verify</phase>
                <goals>
                  <goal>jar-no-fork</goal>
                </goals>
              </execution>
            </executions>
          </plugin>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
            </plugin>
            <!-- NOTE(review): this execution names a phase but no goals; confirm what it
                 is meant to bind. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-deploy-plugin</artifactId>
                <executions>
                    <execution>
                        <phase>deploy</phase>
                    </execution>
                </executions>
            </plugin>

    </plugins>
</build>

这是另一种方式，感谢@RonakShah-

sapply

更新：从其他答案中的基准测试来看，@thelatemail 的答案无疑是最好的。

Answer 2

这是先按 ser + mloc 聚合，然后左联接回原始数据。无需遍历每个值：

# Count, per `ser` group, how many rows carry each `mloc` value. count() on the
# ungrouped `tb1` returns an ungrouped tibble, so no grouping leaks into the
# join below.
child_counts <- count(tb1, ser, mloc, name = "nm_child")

# Look each row's own `pnum` up in those counts (same `ser`, pnum == mloc).
# A `pnum` that no `mloc` points at finds no match in the join and would come
# back as NA; such a row has zero children, so fill it with 0.
tb1 %>%
  left_join(child_counts, by = c("ser" = "ser", "pnum" = "mloc")) %>%
  mutate(nm_child = coalesce(nm_child, 0L))

## A tibble: 11 x 4
#     ser  pnum  mloc nm_child
#   <dbl> <dbl> <dbl>    <int>
# 1  1.00  1.00  0           0
# 2  1.00  2.00  2.00        2
# 3  1.00  3.00  2.00        1
# 4  1.00  4.00  0           0
# 5  1.00  5.00  3.00        0
# 6  2.00  1.00  1.00        2
# 7  2.00  2.00  1.00        0
# 8  2.00  3.00  0           1
# 9  2.00  4.00  0           1
#10  2.00  5.00  3.00        0
#11  2.00  6.00  4.00        0

这种做法的效率会高得多：

# big example
# Repeat rows 1..11 of `tb1` 5e4 times (550,000 rows), then relabel `ser` so
# the rows form 1e5 consecutive groups whose sizes alternate between 5 and 6.
# NOTE(review): assumes `tb1` is still the 11-row example at this point.
tb1 <- tb1[rep(1:11,5e4),]
tb1$ser <- rep(1:1e5, rep(5:6,5e4))

# Per-row approach: inside each `ser` group, every `pnum` is compared against
# the group's whole `mloc` vector and the matches are summed.
system.time({
tb1 %>% 
  group_by(ser) %>% 
  mutate(
    nm_child = sapply(pnum, function(x) sum(x == mloc))
  )
})
#   user  system elapsed 
#   8.83    0.06    8.97     

# Aggregate-then-join approach: count rows per (ser, mloc) once, then
# left-join those counts onto `tb1` by matching pnum to mloc within `ser`.
system.time({
tb1 %>%
  group_by(ser, mloc) %>%
  summarise(nm_child=n()) %>%
  left_join(tb1, ., by=c("ser"="ser","pnum"="mloc"))
})
#   user  system elapsed 
#   0.67    0.02    0.69

用 base R 来写，大致相当于：

# Base R version of count-then-join. Wrapping the left-hand side of the
# formula in cbind(nm_child = ...) names the count column in the result.
tabu <- aggregate(cbind(nm_child = mloc) ~ ser + mloc, data = tb1, FUN = length)

# Match each row's (ser, pnum) against the tabulated (ser, mloc) pairs;
# all.x = TRUE keeps rows of `tb1` that have no match.
merged <- merge(
  tb1, tabu,
  by.x = c("ser", "pnum"), by.y = c("ser", "mloc"),
  all.x = TRUE
)

# Unmatched rows come back with NA; a row nobody points at has zero children.
merged$nm_child[is.na(merged$nm_child)] <- 0L
merged

最后再补上一个 data.table 的写法，它还会再快一个数量级：

# data.table version of count-then-join, written as an update join that adds
# `nm_child` to `tb1` by reference.
# NOTE(review): `tb1` must already be a data.table here (e.g. converted with
# as.data.table()); no such conversion is visible next to this snippet.

# Start every row at zero so rows whose `pnum` is never referenced end up with
# a count of 0 instead of the NA an update join leaves on unmatched rows.
tb1[, nm_child := 0L]

# Count rows per (ser, mloc), then overwrite `nm_child` on the rows of `tb1`
# whose (ser, pnum) equals a counted (ser, mloc).
tb1[tb1[, .N, by = .(ser, mloc)], on = c("ser", "pnum" = "mloc"), nm_child := N]

Answer 3

您可以使用outer和rowSums

# Within each `ser` group, outer() compares every pnum (rows) with every mloc
# (columns); summing each row of that logical matrix counts the mloc values
# equal to that row's pnum.
tb1 %>%
  group_by(ser) %>%
  mutate(nm_child = rowSums(outer(pnum, mloc, FUN = "==")))

# # A tibble: 11 x 4
# # Groups:   ser [2]
#      ser  pnum  mloc nm_child
#    <dbl> <int> <dbl>    <dbl>
#  1     1     1     0        0
#  2     1     2     2        2
#  3     1     3     2        1
#  4     1     4     0        0
#  5     1     5     3        0
#  6     2     1     1        2
#  7     2     2     1        0
#  8     2     3     0        1
#  9     2     4     0        1
# 10     2     5     3        0
# 11     2     6     4        0

使用 @thelatemail 的示例数据进行基准测试

# Repeat rows 1..11 of `tb1` 5e4 times (550,000 rows) and relabel `ser` into
# 1e5 consecutive groups whose sizes alternate between 5 and 6.
# NOTE(review): assumes `tb1` is still the 11-row example at this point.
tb1 <- tb1[rep(1:11,5e4),]
tb1$ser <- rep(1:1e5, rep(5:6,5e4))

# data.table copy of the same rows for the data.table contender.
# NOTE(review): as.data.table() and the `[`/`:=` syntax used below need the
# data.table package attached; no library(data.table) call is visible here.
tb2 <- as.data.table(tb1)

library(microbenchmark)

# Each named expression computes nm_child a different way; every one is timed
# 10 times.
microbenchmark(
  # Per-row loop inside each group.
  sapply = {
    tb1 %>% 
      group_by(ser) %>% 
      mutate(
        nm_child = sapply(pnum, function(x) sum(x == mloc))
      )
  },
  # Count per (ser, mloc), then left-join the counts back by pnum == mloc.
  join = {
    tb1 %>%
      group_by(ser, mloc) %>%
      summarise(nm_child=n()) %>%
      left_join(tb1, ., by=c("ser"="ser","pnum"="mloc"))
  },
  # pnum-by-mloc equality matrix per group, summed across rows.
  outer1 = {
    tb1 %>% 
      group_by(ser) %>% 
      mutate(nm_child = rowSums(outer(pnum, mloc, `==`)))
  },
  # Same matrix transposed (mloc-by-pnum), summed down columns.
  outer2 = {
    tb1 %>% 
      group_by(ser) %>% 
      mutate(nm_child = colSums(outer(mloc, pnum, `==`)))
  },
  # Update join; `:=` writes nm_child into tb2 by reference, so every iteration
  # after the first overwrites a column that already exists.
  data.table = {
    tb2[tb2[, .N, by=.(ser,mloc)], on=c("ser","pnum"="mloc"), nm_child := N][]
    },
  times = 10)

基准输出

# Unit: milliseconds
#        expr       min        lq      mean    median        uq        max neval
#      sapply 8233.5740 8297.7331 8939.9369 8647.5935 8956.3364 10706.3362    10
#        join  889.6682  899.0483  935.7493  908.1441  932.2827  1135.8424    10
#      outer1 4551.0428 4631.1605 5184.9359 4986.7327 5160.0109  7563.4190    10
#      outer2 4495.9134 4552.1169 4763.5954 4723.7783 4893.2190  5198.4556    10
#  data.table  108.7449  115.7866  124.4453  120.6742  125.7591   171.8111    10

涉及行特定元素和整个组元素的特定组计算

3 个答案: