删除< 0值&删除行

时间:2014-10-23 18:26:58

标签: r

编辑,添加了以下代码

首先,抱歉,我现在无法想出一个好的可重现的例子,但我认为我的问题可以在没有它的情况下得到解答。

我的数据涉及从手动操作的测试机器获取的一些线图。 因为它是手动操作的,所以我们得到可变的开始时间,因此数据没有正确地重叠"彼此

以前通过使用以下代码解决了这个问题:

#import data
x <- read.csv("smoke.csv", head=T, sep=",")

#flag '0' values, remove all zero values
row_sub = apply(x, 1, function(row) all(row > 0))
y <- x[row_sub,]

之前由于样本量小且时间相对紧密而起作用。随着更多的样本,我现在得到了一些“剪裁”#39;在图表中:

picture

我没有专家,请原谅解释:&#39; row_sub&#39;是&#39; x&#39;的修改版本只保留所有值都是>的行。 0

附图right here说明了这个问题。 我们可以看到第一个样品是可以的,因为它可能花费最长时间插入设备中。但是操作员在整个测试过程中变得更好,减少了样品进料时间,导致样品4出现极端剪切。

我知道我可以通过简单地删除每个样本的前导零值,然后剪切所有数据的尾端以确保它们都具有相同的数据点来轻松地手动执行此操作。但我无法弄清楚如何在R中做到这一点。

修改 以下是数据:http://pastebin.com/iEW4sH2a

# Check & load required packages 
if (require("grid") == FALSE) install.packages("grid")
if (require("ggplot2") == FALSE) install.packages("ggplot2")
if (require("gridExtra") == FALSE) install.packages("gridExtra")
if (require("flux") == FALSE) install.packages("flux")
if (require("matrixStats") == FALSE) install.packages("matrixStats")
if (require("mgcv") == FALSE) install.packages("mgcv")


# Set working directory, read datafile
setwd("C location here")
x <- read.csv("smoke.csv", head=T, sep=",")
# Remove 'time' column

# flag '0' values, remove zero values
row_sub = apply(x, 1, function(row) all(row > 0, na.rm=TRUE))
y <- x[row_sub,]
rownames(y) <- NULL

# create time axis with appropriate length & attach to df
time <- seq(0,120, by=0.2)
time <- time[0:nrow(y)]
z <- cbind(time, y)
z <- na.omit(z)

#graph parameters
y_max <- 5.0

a.means <- rowMeans(z[,2:5])
b.means <- rowMeans(z[,6:9])
c.means <- rowMeans(z[,10:13])
d.means <- rowMeans(z[,14:17])

all.data <- cbind(z, a.means, b.means, c.means, d.means)

# Multiple plot function
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols:   Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
    require(grid)

    # Make a list from the ... arguments and plotlist
    plots <- c(list(...), plotlist)

    numPlots = length(plots)

    # If layout is NULL, then use 'cols' to determine layout
    if (is.null(layout)) {
        # Make the panel
        # ncol: Number of columns of plots
        # nrow: Number of rows needed, calculated from # of cols
        layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                         ncol = cols, nrow = ceiling(numPlots/cols))
    }

    if (numPlots==1) {
        print(plots[[1]])

    } else {
        # Set up the page
        grid.newpage()
        pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

        # Make each plot, in the correct location
        for (i in 1:numPlots) {
            # Get the i,j matrix positions of the regions that contain this subplot
            matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

            print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                            layout.pos.col = matchidx$col))
        }
    }
}

#calculate area under curve
a.auc <- round(auc(z$time, a.means),2)
b.auc <- round(auc(z$time, b.means),2)
c.auc <- round(auc(z$time, c.means),2)
d.auc <- round(auc(z$time, d.means),2)
# Prepare plots

a_graph <- ggplot(data=all.data, aes(time)) + 
    geom_point(aes(y=a1), alpha=0.1, color="indianred") + 
    geom_point(aes(y=a2), alpha=0.1, color="indianred1") + 
    geom_point(aes(y=a3), alpha=0.1, color="indianred2") + 
    geom_point(aes(y=a4), alpha=0.1, color="indianred3") +
    geom_line(aes(y=a.means), size=1, color="indianred4") +
    ggtitle("145A: Standard") +
    geom_text(aes(75, 1.5, label = a.auc)) +
    scale_x_continuous("Time(s)", limits=c(0,120)) +
    scale_y_continuous("Smoke(%Opacity)", limits=c(0,y_max))

b_graph <- ggplot(data=all.data, aes(time)) + 
    geom_point(aes(y=b1), alpha=0.1, color="chartreuse") + 
    geom_point(aes(y=b2), alpha=0.1, color="chartreuse1") + 
    geom_point(aes(y=b3), alpha=0.1, color="chartreuse2") + 
    geom_point(aes(y=b4), alpha=0.1, color="chartreuse3") +
    geom_line(aes(y=b.means), size=1, color="chartreuse4") +
    ggtitle("145B: +0.5%") +
    geom_text(aes(75, 1.5, label = b.auc)) +
    scale_x_continuous("Time(s)", limits=c(0,120)) +
    scale_y_continuous("Smoke(%Opacity)", limits=c(0,y_max))

c_graph <- ggplot(data=all.data, aes(time)) + 
    geom_point(aes(y=c1), alpha=0.1, color="turquoise") + 
    geom_point(aes(y=c2), alpha=0.1, color="turquoise1") + 
    geom_point(aes(y=c3), alpha=0.1, color="turquoise2") + 
    geom_point(aes(y=c4), alpha=0.1, color="turquoise3") +
    geom_line(aes(y=c.means), size=1, color="turquoise4") +
    ggtitle("145C: +1.0%") +
    geom_text(aes(75, 1.5, label = c.auc)) +
    scale_x_continuous("Time(s)", limits=c(0,120)) +
    scale_y_continuous("Smoke(%Opacity)", limits=c(0,y_max))

d_graph <- ggplot(data=all.data, aes(time)) + 
    geom_point(aes(y=d1), alpha=0.1, color="indianred") + 
    geom_point(aes(y=d2), alpha=0.1, color="indianred1") + 
    geom_point(aes(y=d3), alpha=0.1, color="indianred2") + 
    geom_point(aes(y=d4), alpha=0.1, color="indianred3") +
    geom_line(aes(y=d.means), size=1, color="indianred4") +
    ggtitle("145A: Standard") +
    geom_text(aes(75, 1.5, label = d.auc)) +
    scale_x_continuous("Time(s)", limits=c(0,120)) +
    scale_y_continuous("Smoke(%Opacity)", limits=c(0,y_max))

sample_names <- as.data.frame(c("145A", "145B", "145C", "145D"))
sample_auc <- as.data.frame(c(a.auc, b.auc, c.auc, d.auc))
sample_all <- as.data.frame(cbind(sample_names,sample_auc))
colnames(sample_all) <- c("x","y")

multiplot(a_graph, b_graph, c_graph, d_graph, cols=2)

2 个答案:

答案 0 :(得分:1)

我仍然没有100%确定我理解这个问题,但我认为我更了解它。

根据我的理解,除了时间之外,您的数据列会以不同的金额向前移动,而您在开始时并不关心这些值。

如果是这种情况,您可以做的是定义一个阈值小值thresh,之后您要将数据视为每列中的开头,并在此之前丢弃所有内容。

## Untested ##
x <- lapply(x, as.numeric)
thresh <- 0.01
## store all indices until thresh is exceeded
ind2Rm <- lapply(x, function(col) 1:which(col > thresh)[1])
for(j in 2:length(x)) { # don't loop over time which is 1st column
  x[[j]] <- x[[j]][-ind2Rm[[j]]] # remove these first values that don't exceed thresh
}

在此之后,您需要将数据组合以绘制回数据框。由于列表元素的长度可能不同,因此可以通过填充每列末尾的NA来将它们组合成数据帧。有关一种方法,请参阅answer to this SO question

答案 1 :(得分:1)

也许这就是你想要的?

dt <- list(ax = x[c(1,grep("a", colnames(x)))], bx = x[c(1,grep("b", colnames(x)))], cx = x[c(1,grep("c", colnames(x)))], dx = x[c(1,grep("d", colnames(x)))])

z <- lapply(dt, function(k) {
    out <- k[apply(k[-1], 1, function(row) all(row > 0, na.rm=TRUE)),]
    out$time <- seq(from = 0, by = 0.2, length = nrow(out))
    out
    })

Reduce(function(x, y) merge(x, y, by="time", all = TRUE), z)