筛选数据框中的每一行并手动对其进行分类

时间:2016-06-29 14:30:50

标签: r dataframe

有人可以推荐一种有效的方法来筛选数据框中的每一行并手动对其进行分类吗?例如,我可能想要将垃圾邮件与电子邮件分开,或者列出招聘广告,求职者或约会机构档案(我知道Tinder通过向左或向右滑动来实现这一点)。

我的数据集足够小,可以手动分类。我想如果它更大,我可能只想手动对其中的一部分进行分类,以便训练机器学习算法,如Naive Bayes,为我完成任务。

我会告诉你我现在所拥有的东西,但这不是一项特别原创的任务,所以必须有一种不那么粗暴的方式来做到这一点,有人已经想到了! (作为一个新手,我对R的力量印象深刻,但是当清除屏幕或捕获击键等小任务变得非常重要时,我也感到困惑)

# Let us suppose I am using this built-in dataset to draw up a
# shortlist of where I might wish to go on holiday
df <- data.frame(state.x77);

# pp - define a task-specific pretty print function
pp <- function(row) {
    print(row); # Example dataset is simple enough to just print the entire row
}

# cls - clear the screen (this hack works on Windows but I've commented it for now)
cls <- function() {
    #system("powershell -ExecutionPolicy Bypass -command (New-Object -ComObject Wscript.Shell).SendKeys([string][char]12)");
}

# It would halve the number of keystrokes needed if I knew a way to read
# a single character
readcharacter <- readline;

sift <- function(df, pp)
{
    classification = rep('', nrow(df));

    for (nRow in 1:nrow(df))
    {
        cls();
        pp(df[nRow,]);
        cat("\nEnter 'a' to discard, 'd' to keep, 'q' to quit\n");

        char <- '';
        while (char != 'a' && char != 'd' && char != 'q') {
            char <- readcharacter();
        }

        if (char == 'q')
            break;

        classification[nRow] = char;
    }

    return(cbind(df,classification=classification));
}

result = sift(df, pp);

cls();
cat("Shortlist:\n");
print(row.names(result[result$classification=='d',]));

1 个答案:

答案 0 :(得分:0)

那么StackOverflow社区如何使用this Shiny app来解决我的问题?我不希望看到Shiny在数据分析的早期部分中使用 - 通常只有在我们有一些我们想要动态探索或呈现的结果时它才会发挥作用。

学习Shiny很有趣也很有用,但如果能找到一个不太复杂的答案,我会更喜欢它。

library(shiny);

#
# shortlist - function that allows us to shortlist through the rows in a data frame efficiently
#
shortlist <- function(df, sTitle, sRowName) {

    createUI <- function() {

        listHeading <- list(
                    textOutput(outputId = "Progress"),
                    tags$br(),
                    fluidRow(
                        column(width=1, sRowName),
                        column(width=9, textOutput(outputId = "RowName"))));

        listFields <- lapply(names(df), function(sFieldname) {

            return(fluidRow(
                column(width=1, sFieldname),
                column(width=9, textOutput(outputId = sFieldname))));
        });

        listInputs <- list(
                    tags$br(),
                    tags$table(
                        tags$tr(
                            tags$td(" "),
                            tags$td(actionButton(inputId="Up", label="W", disabled=TRUE, width="100%"))),
                        tags$tr(
                            tags$td(width="100px", actionButton(inputId="Discard", label="Discard, A", width="100%")),
                            tags$td(width="100px", actionButton(inputId="Down", label="S", disabled=TRUE, width="100%")),
                            tags$td(width="100px", actionButton(inputId="Keep", label="Keep, D", width="100%")))),
                        tags$script("

                            // JavaScript implemented keyboard shortcuts, including lots of conditions to
                            // ensure we're finished processing one keystroke before we start the next.

                            var bReady = false;

                            $(document).on('shiny:recalculating', function(event) {
                                bReady = false;
                            });

                            $(document).on('shiny:recalculated', function(event) {
                                setTimeout(function() {bReady = true;}, 500);
                            });

                            $(document).on('keypress', function(event) {

                                if (bReady) {

                                    switch(event.key.toLowerCase()) {
                                    case 'a':
                                        document.getElementById('Discard').click();
                                        bReady = false;
                                        break;
                                    case 'd':
                                        document.getElementById('Keep').click();
                                        bReady = false;
                                        break;
                                    }
                                }
                            });

                            // End of JavaScript

                        "));

        listPanel <- list(
                    title = sTitle,
                    tags$br(),
                    conditionalPanel(
                        condition = paste("input.Keep + input.Discard <", nrow(df)),
                        append(append(listHeading, listFields), listInputs)));

        listShortlist <- list(
                    tags$hr(),
                    tags$h4("Shortlist:"),
                    dataTableOutput(outputId="Shortlist"));

        ui <- do.call(fluidPage, append(listPanel, listShortlist));

        return(ui);
    }

    app <- shinyApp(ui = createUI(), server = function(input, output) {

        classification <- rep('', nrow(df));

        getRow <- reactive({

            return (input$Keep + input$Discard + 1);
        });

        classifyRow <- function(nRow, char) {

            if (nRow <= nrow(df)) {
                classification[nRow] <<- char;
            }

            # In interactive mode, automatically stop the app when we're finished
            if ( interactive() && nRow >= nrow(df) ) {
                stopApp(classification);
            }
        }

        observeEvent(input$Discard, {classifyRow(getRow() - 1, 'a')});
        observeEvent(input$Keep,    {classifyRow(getRow() - 1, 'd')});

        output$Progress = renderText({paste("Showing record", getRow(), "of", nrow(df))});
        output$RowName  = renderText({row.names(df)[getRow()]});

        lapply(names(df), function(sFieldname) {
            output[[sFieldname]] <- renderText({df[getRow(), sFieldname]});
        });

        output$Shortlist <- renderDataTable(options = list(paging = FALSE, searching = FALSE), {

            # Mention the 'keep' input to ensure this code is called when the 'keep' button
            # is pressed.  That way the shortlist gets updated when an item to be added to it.
            dummy <- input$Keep;

            # Construct the shortlist
            shortlist <- data.frame(row.names(df[classification == 'd',]));
            colnames(shortlist) <- sRowName;
            return(shortlist);
        });

    });

    if (interactive()) {
        classification <- runApp(app);

        return(cbind(df, classification = classification));
    } else {
        return(app);
    }
}

#
# And now some example code.
# Shortlist the built in state.x77 data set (let us suppose I am drawing up
# a shortlist of where I might wish to go on holiday)
#

df <- data.frame(state.x77);

result <- shortlist(df = df, "Choose states", "State");

if (interactive()) {
    cat("Shortlist:\n");
    print(row.names(result[result$classification == 'd',]));
} else {
    return (result);
}