Question

假设我在R中有一个很大的数据框，其中列出了商品编号和品牌编号。

每个品牌可以有多个商品编号，但是商品编号只能有一个对应的品牌编号。

是否有一种快速的方法，可以像下面这样推断并替换缺失的品牌条目（NA值），同时在找不到确切替换项的地方保留NA值？

import java.io.BufferedReader;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.util.ArrayDeque;
    import java.util.Arrays;
    import java.util.Deque;
    import java.util.Stack;

    /**
     * Reads an {@code n x m} integer matrix from standard input and prints the
     * size of the largest region of equal-valued, orthogonally adjacent cells.
     *
     * <p>The grid wraps around: the first and last row are neighbours, and so
     * are the first and last column.
     *
     * <p>Input format: a first line with {@code n} and {@code m}, followed by
     * {@code n} lines of {@code m} whitespace-separated integers each.
     */
    public class Main {
        public static void main(String[] args) throws IOException {
            BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
            String[] firstLine = readTokens(br);
            int n = Integer.parseInt(firstLine[0]);
            int m = Integer.parseInt(firstLine[1]);
            int[][] matrix = new int[n][];
            boolean[][] visitedElements = new boolean[n][m];

            for (int row = 0; row < n; row++) {
                int[] values = Arrays.stream(readTokens(br)).mapToInt(Integer::parseInt).toArray();
                // Fail here with a clear message instead of an index error deep inside the search.
                if (values.length < m) {
                    throw new IOException("Row " + row + " has " + values.length + " values, expected " + m);
                }
                matrix[row] = values;
            }

            int maxCounter = 0;
            for (int row = 0; row < n; row++) {
                for (int col = 0; col < m; col++) {
                    // Every cell belongs to exactly one region, so start a search only from unvisited cells.
                    if (!visitedElements[row][col]) {
                        maxCounter = Math.max(maxCounter, countAreaInMatrixDFS(row, col, matrix, visitedElements, n, m));
                    }
                }
            }

            System.out.println(maxCounter);
        }

        /**
         * Reads one line and splits it into whitespace-separated tokens.
         * Leading, trailing and repeated whitespace is tolerated.
         *
         * @throws IOException if the input ends before a line could be read
         */
        private static String[] readTokens(BufferedReader reader) throws IOException {
            String line = reader.readLine();
            if (line == null) {
                throw new IOException("Unexpected end of input");
            }
            return line.trim().split("\\s+");
        }

        /**
         * Counts the cells of the region that contains {@code (row, col)} using an
         * iterative depth-first search, marking each counted cell in
         * {@code checkedElements}.
         *
         * @param rowCount number of rows in {@code matrix}
         * @param colCount number of columns in {@code matrix}
         * @return the number of cells in the region, at least 1
         */
        private static int countAreaInMatrixDFS(int row, int col, int[][] matrix, boolean[][] checkedElements, int rowCount, int colCount) {
            Deque<int[]> stack = new ArrayDeque<>();
            stack.push(new int[]{row, col});
            checkedElements[row][col] = true;
            int counter = 1;

            while (!stack.isEmpty()) {
                int[] cell = stack.pop();
                int value = matrix[cell[0]][cell[1]];

                for (int[] neighbour : getNeighbourNodes(cell[0], cell[1], rowCount, colCount)) {
                    int neighbourRow = neighbour[0];
                    int neighbourCol = neighbour[1];
                    // Mark on push (not on pop) so a cell is never queued or counted twice.
                    if (!checkedElements[neighbourRow][neighbourCol] && matrix[neighbourRow][neighbourCol] == value) {
                        checkedElements[neighbourRow][neighbourCol] = true;
                        stack.push(neighbour);
                        counter++;
                    }
                }
            }

            return counter;
        }

        /**
         * Returns the up, down, left and right neighbours of a cell, in that order.
         * Indices that would leave the grid wrap to the opposite edge.
         */
        private static int[][] getNeighbourNodes(int rowIndex, int colIndex, int rowCount, int colCount) {
            return new int[][]{
                {(rowIndex - 1 + rowCount) % rowCount, colIndex},
                {(rowIndex + 1) % rowCount, colIndex},
                {rowIndex, (colIndex - 1 + colCount) % colCount},
                {rowIndex, (colIndex + 1) % colCount}
            };
        }
    }

trainset

trainsetresult

补充信息：我正在处理的数据框中，商品编号和品牌编号各有数百个以上。

谢谢！

Answer 1

一种方法是使用联接，让我们尝试data.table：

# data.table provides setDT() and the X[Y, on = ...] join syntax used below.
library(data.table)

# Convert trainset to a data.table by reference.
setDT(trainset)

# Join the rows with a known brand (X) onto every row of trainset (Y) by item,
# returning the brand found in X together with the item; items with no match
# in X get NA.
# NOTE(review): the join returns every matching row of X, so an item with more
# than one non-NA brand row would yield several output rows per trainset row.
# Confirm against the data, and consider de-duplicating X first.
trainset[!is.na(brand),][trainset, on = "item", .(brand, item)]

输出：

    brand item
 1:     1   50
 2:     2  100
 3:     2  100
 4:     3  300
 5:     2  200
 6:     2  100
 7:     2  100
 8:     2  100
 9:     2  100
10:     2  100
11:     2  100
12:     1   50
13:     2  200
14:     3  300
15:     2  100
16:     2  100
17:     1   50
18:     2  200
19:    NA  900

您也可以使用minimum或maximum来代替，例如用dplyr：

library(dplyr)

trainset %>%
  group_by(item) %>%
  mutate(
    # Use the smallest known brand of each item. The condition is a single
    # value per group, so a plain if/else is used instead of ifelse().
    # When every entry is NA, brand[1L] is an NA of the column's own type.
    brand = if (all(is.na(brand))) brand[1L] else min(brand, na.rm = TRUE)
  ) %>%
  # Drop the grouping so later verbs operate on the whole table again.
  ungroup()

另一种方法是将dplyr与zoo结合使用，不过我认为这有些小题大做：

trainset %>%
  group_by(item) %>%
  # Sort so that known brands come before the NAs (arrange() puts NA last).
  # Note that this reorders the rows of the result.
  arrange(brand) %>%
  mutate(
    # Carry the last known brand forward within each item. na.rm = FALSE keeps
    # the NAs of items without any known brand, so the length is unchanged.
    brand = zoo::na.locf(brand, na.rm = FALSE)
  ) %>%
  # Drop the grouping so later verbs operate on the whole table again.
  ungroup()

Answer 2

尝试以下方法。创建仅包含完整且唯一的行的数据框后，它使用match获取该临时数据框中item列的位置。

# Lookup table of the known (item, brand) pairs. Only `item` and `brand` are
# tested for NA, so missing values in any other column do not discard rows.
known <- !is.na(trainset$item) & !is.na(trainset$brand)
tmp <- trainset[known, c("item", "brand")]
tmp <- tmp[!duplicated(tmp), ]
rm(known)

# Position of each item in the lookup table (NA when no brand is known).
i <- match(trainset$item, tmp$item)
# Fill only the missing brands; entries that are already known stay untouched.
trainset$brand[is.na(trainset$brand)] <- tmp$brand[i[is.na(trainset$brand)]]

检查是否有效。

# Compare the filled data frame with the expected result from the question.
all.equal(trainset, trainsetresult)
#[1] TRUE

最终清理。

# Remove the temporary lookup table and the index vector from the workspace.
rm(tmp, i)

通过其他列中的给定信息替换R中的缺失值

2 个答案: