如何创建一个遍历多个文件夹并输出表格和图表的knitr报告

时间:2013-10-22 13:00:18

标签: r loops latex knitr

我正在尝试生成一个质量控制报告:该报告用sapply遍历若干文件夹(每个文件夹对应一个实验),并在一个函数内为每个加载的结果创建表格和图。生成的pdf应该先显示文件夹的名称,然后依次显示表格和图表。我首先创建了R脚本(运行良好),然后创建了一个rnw文件。图确实生成了,但pdf输出(pdf output)中有2个问题:

  1. 代码块loop_n_plots中没有生成表格;

  2. 创建完所有图后,会出现一条看似列表输出的无意混乱线。

问:如何让表格出现在pdf中?在块“table_files”中生成的表可以正常显示,但sapply所调用的函数内部生成的表却不行。为什么?更一般地说,我目前的做法对于knitr报告来说合适吗?是否最好先把表格和图表存入一个列表,然后再循环该列表逐个打印?

    我已经尝试调整过代码块(chunk)选项,但没有任何效果。

    示例代码:

    \documentclass{report}
    
    \begin{document}
    \title{Sequencing Quality Report}
    \author{Deep Sequencing Group - SFB655}
    \maketitle
    
    
    <<knitr_option, cache=FALSE, echo=FALSE, results='hide'>>=
    library(knitr)
    ## Global chunk options applied to every later chunk.
    ## The figure-height option is spelled `fig.height`; the former spelling
    ## `fig.heigth` is not a knitr option, so the requested height of 8 was
    ## never applied and the default height was used instead.
    ## NOTE(review): `par=TRUE` is a custom option; it presumably only has an
    ## effect if a `par` hook is registered with knit_hooks$set() -- none is
    ## visible in this document, confirm.
    opts_chunk$set(fig.align='center', fig.width=14, fig.height=8, out.width="1.2\\textwidth",  par=TRUE)
    @
    
    
    <<R_arguments, cache=FALSE, echo=FALSE, include=FALSE>>=
    
    ###### Libraries ######
    library(reshape)
    library(ggplot2)
    theme_set(theme_bw(16)) # removes grey grid and increases letter size. Ideal for presentations
    library(RColorBrewer)
    library(plyr)
    library(scales) # for natural numbers in axis
    library(xtable)
    library(rattle) # needed to generate a table in knitr?
    #######################
    
    
    ###### Function definitions ######
    ## ggplot theme with extra space between legends and axis
    gg.axis.space <- theme(axis.title.y=element_text(vjust=0.2), axis.title.x=element_text(vjust=0.2))
    
    
    ## Build the per-library QC output for one fastQC result folder.
    ##
    ## fqc.folder: path of a single fastQC result folder; the three
    ##             "*_fastqc_data.temp" files read below are expected inside it.
    ##
    ## Side effect: draws the "Per base sequence quality" ggplot via print().
    ## Return value: whatever print() returns for the ggplot object (last
    ##               expression of the body).
    ##
    ## NOTE(review): the three xtable() calls below only construct table
    ## objects. Inside a function a bare expression is not auto-printed, so
    ## these values are discarded and no LaTeX table is emitted; each call
    ## would need to be wrapped in print() for the table to appear.
    ReturnStatsPlotsAndTables <- function(fqc.folder){
    
       # for(fqc.folder in fq_fastqc.folders){
       ######################################
       ## for each folder in the vector will
       ## plot stats and 
       ## print tables of fastQC results
    
       ## which library is being analysed?
       ## (keeps the "L..." part that precedes ".fq_fastqc" in the folder path)
       fastq.lib <- data.frame(Libraries = gsub(".*/(L.*)\\.fq_fastqc", "\\1", fqc.folder, perl=T))
       xtable(fastq.lib)
    
       ## Basic statistics - table ##
       ## tab-separated file with a header row; only the first two columns are tabulated
       stats.path <- paste(fqc.folder, "/", "Basic_Statistics_fastqc_data.temp", sep="")
       basic.stats <- read.table(stats.path, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
    #    basic.stats[ ,1:2]
       xtable(basic.stats[ ,1:2]) 
    
    
       ## Summary of filters - table ##
       stats.path <- paste(fqc.folder, "/", "filters_summary_fastqc_data.temp", sep="")
       summary.filters <-  read.table(stats.path, 
          header = TRUE, sep = "\t", stringsAsFactors = FALSE)
    #    summary.filters
       xtable(summary.filters)
    
    
       ## Per base sequence quality ##
       stats.path <- paste(fqc.folder, "/", "Per_base_sequence_quality_fastqc_data.temp", sep="")
       base.qual <- read.table(stats.path, 
          header = TRUE, sep = "\t", stringsAsFactors = FALSE)
    
    
       base.qual$Base <- factor(base.qual$Base, as.character(base.qual$Base)) # re-order the levels by order of appearance in DF
    
       ## NOTE(review): plot.new() starts a base-graphics page before the ggplot
       ## is drawn; it is unclear whether this is needed -- confirm.
       plot.new()
       ## Box plot drawn from precomputed summary columns (stat = "identity"):
       ## whiskers = 10th/90th percentile, box = lower/upper quartile, bar = median.
       ## The three annotate("rect") layers shade the score bands 0-20 (red),
       ## 20-28 (yellow) and 28+ (green) behind the boxes.
       base.qual.p <- ggplot(base.qual, aes(x = Base, ymin = X10th.Percentile, lower = Lower.Quartile, middle = Median, upper = Upper.Quartile, ymax = X90th.Percentile, fill = Lower.Quartile)) + geom_boxplot(stat = "identity") + 
          theme(axis.text.x = element_text(angle=30, hjust=1, vjust=1)) + 
          annotate("rect", xmin=-Inf, xmax=Inf, ymin=0, ymax=20, alpha=0.1, fill="red") +
          annotate("rect", xmin=-Inf, xmax=Inf, ymin=20, ymax=28, alpha=0.1, fill="yellow") +
          annotate("rect", xmin=-Inf, xmax=Inf, ymin=28, ymax=Inf, alpha=0.1, fill="green") +
          ggtitle("Per base sequence quality") + ylab("Quality score (Phred score) ") + xlab("Position of base in read")
    
       ## ggplot objects must be printed explicitly inside a function to be drawn
       print(base.qual.p)
    
    }
    @
    
    \chapter{Preamble}
    
    This an automated quality control report generated for the following fastq files:
    
    <<table_files, echo=FALSE, results="asis">>=
    ##############################################
    ## Locate the fastQC result folders and tabulate the libraries they belong to.
    
    ## Recursively list the parsed fastQC files below `folder`.
    # testing #
    # setwd("/projects/seq-work/analysis/martinad/p0196-totalRNA/")
    folder <- "./"
    ## `pattern` is a regular expression: the dot is escaped so that it matches
    ## a literal "." instead of any character.
    filenames <- list.files(path=folder, pattern="fastqc_data\\.temp", recursive=TRUE)
    fq_fastqc.folders <- unique(dirname(filenames)) # the folders that contain fastQC
    ## Library name = the "L..." part preceding ".fq_fastqc" in each folder path.
    ## TRUE is spelled out because T is an ordinary variable that can be reassigned.
    fastq.libs <- data.frame(Libraries = gsub(".*/(L.*)\\.fq_fastqc", "\\1", fq_fastqc.folders, perl=TRUE))
    ## At the top level of a chunk the xtable object is auto-printed, and
    ## results="asis" passes the generated LaTeX through unchanged.
    xtable(fastq.libs)
    @
    
    
    
    \chapter{FastQC}
    
    <<loop_n_plots, echo=FALSE, results='asis'>>=
    ## Run the per-folder report function on the first three fastQC folders.
    ## NOTE(review): sapply() collects the value returned by each call and, being
    ## the last top-level expression of the chunk, that collected value is
    ## auto-printed into the report -- presumably the source of the stray
    ## list-like output after the plots; confirm.
    sapply(fq_fastqc.folders[1:3], ReturnStatsPlotsAndTables)
    @
    
    \end{document}
    

    函数ReturnStatsPlotsAndTables实际上更长,但这里给出的部分已足以说明问题所在。

1 个答案:

答案 0 :(得分:2)

找到有两个步骤的解决方案:

  1. 用一个for循环替换sapply,并把函数ReturnStatsPlotsAndTables中的语句直接放进循环体;

  2. 在for循环中,需要使用以下方式显式打印表:

    print(xtable(fastq.lib))

以下是最终代码:

    \documentclass{report}
    
    \begin{document}
    \title{Sequencing Quality Report}
    \author{Deep Sequencing Group - SFB655}
    \maketitle
    
    
    <<knitr_option, cache=FALSE, echo=FALSE, results='hide'>>=
    library(knitr)
    ## Global chunk options applied to every later chunk.
    ## The figure-height option is spelled `fig.height`; the former spelling
    ## `fig.heigth` is not a knitr option, so the requested height of 8 was
    ## never applied and the default height was used instead.
    ## NOTE(review): `par=TRUE` is a custom option; it presumably only has an
    ## effect if a `par` hook is registered with knit_hooks$set() -- none is
    ## visible in this document, confirm.
    opts_chunk$set(fig.align='center', fig.width=14, fig.height=8, out.width="1.2\\textwidth",  par=TRUE)
    @
    
    
    <<R_arguments, cache=FALSE, echo=FALSE, include=FALSE>>=
    
    ###### Libraries ######
    library(reshape)
    library(ggplot2)
    theme_set(theme_bw(16)) # removes grey grid and increases letter size. Ideal for presentations
    library(RColorBrewer)
    library(plyr)
    library(scales) # for natural numbers in axis
    library(xtable)
    library(rattle) # needed to generate a table in knitr?
    #######################
    
    
    ###### Function definitions ######
    ## ggplot theme with extra space between legends and axis
    gg.axis.space <- theme(axis.title.y=element_text(vjust=0.2), axis.title.x=element_text(vjust=0.2))
    
    @
    
    
    \chapter{Preamble}
    
    This an automated quality control report generated for the following fastq files:
    
    <<table_files, echo=FALSE, results="asis">>=
    ##############################################
    ## Locate the fastQC result folders and tabulate the libraries they belong to.
    
    ## Recursively list the parsed fastQC files below `folder`.
    # testing #
    # setwd("/projects/seq-work/analysis/martinad/p0196-totalRNA/")
    folder <- "./"
    ## `pattern` is a regular expression: the dot is escaped so that it matches
    ## a literal "." instead of any character.
    filenames <- list.files(path=folder, pattern="fastqc_data\\.temp", recursive=TRUE)
    fq_fastqc.folders <- unique(dirname(filenames)) # the folders that contain fastQC
    ## Library name = the "L..." part preceding ".fq_fastqc" in each folder path.
    ## TRUE is spelled out because T is an ordinary variable that can be reassigned.
    fastq.libs <- data.frame(Libraries = gsub(".*/(L.*)\\.fq_fastqc", "\\1", fq_fastqc.folders, perl=TRUE))
    ## At the top level of a chunk the xtable object is auto-printed, and
    ## results="asis" passes the generated LaTeX through unchanged.
    xtable(fastq.libs)
    @
    
    
    
    \chapter{FastQC}
    
    <<loop_n_plots, echo=FALSE, results="asis">>=
    ## For each fastQC folder: emit its tables as LaTeX and draw its quality plot.
    ## A for loop is used instead of sapply() so that no collected return value
    ## is auto-printed into the report after the plots.
    # sapply(fq_fastqc.folders[1:3], ReturnStatsPlotsAndTables)
    ## head() limits the run to the first two folders; unlike x[1:2] it does not
    ## produce NA entries (and a failing read) when fewer than two folders exist.
    for (fqc.folder in head(fq_fastqc.folders, 2)){
       # for(fqc.folder in fq_fastqc.folders){
       ######################################
       ## for each folder in the vector will
       ## plot stats and
       ## print tables of fastQC results
    #    print(fqc.folder)
       ## which library is being analysed?
       ## (keeps the "L..." part that precedes ".fq_fastqc" in the folder path)
       fastq.lib <- data.frame(Libraries = gsub(".*/(L.*)\\.fq_fastqc", "\\1", fqc.folder, perl=TRUE))
       ## inside a loop nothing is auto-printed, so every table needs print()
       print(xtable(fastq.lib))
    
       ## Basic statistics - table ##
       ## tab-separated file with a header row; only the first two columns are tabulated
       stats.path <- file.path(fqc.folder, "Basic_Statistics_fastqc_data.temp")
       basic.stats <- read.table(stats.path, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
    #    basic.stats[ ,1:2]
       print(xtable(basic.stats[ ,1:2]))
    
    
       ## Summary of filters - table ##
       stats.path <- file.path(fqc.folder, "filters_summary_fastqc_data.temp")
       summary.filters <-  read.table(stats.path,
          header = TRUE, sep = "\t", stringsAsFactors = FALSE)
    #    summary.filters
       print(xtable(summary.filters))
    
    
       ## Per base sequence quality ##
       stats.path <- file.path(fqc.folder, "Per_base_sequence_quality_fastqc_data.temp")
       base.qual <- read.table(stats.path,
          header = TRUE, sep = "\t", stringsAsFactors = FALSE)
    
    
       base.qual$Base <- factor(base.qual$Base, as.character(base.qual$Base)) # re-order the levels by order of appearance in DF
    
       ## NOTE(review): plot.new() starts a base-graphics page before the ggplot
       ## is drawn; it is unclear whether this is needed -- kept so the existing
       ## output is not altered, confirm.
       plot.new()
       ## Box plot drawn from precomputed summary columns (stat = "identity"):
       ## whiskers = 10th/90th percentile, box = lower/upper quartile, bar = median.
       ## The three annotate("rect") layers shade the score bands 0-20 (red),
       ## 20-28 (yellow) and 28+ (green) behind the boxes.
       base.qual.p <- ggplot(base.qual, aes(x = Base, ymin = X10th.Percentile, lower = Lower.Quartile, middle = Median, upper = Upper.Quartile, ymax = X90th.Percentile, fill = Lower.Quartile)) + geom_boxplot(stat = "identity") +
          theme(axis.text.x = element_text(angle=30, hjust=1, vjust=1)) +
          annotate("rect", xmin=-Inf, xmax=Inf, ymin=0, ymax=20, alpha=0.1, fill="red") +
          annotate("rect", xmin=-Inf, xmax=Inf, ymin=20, ymax=28, alpha=0.1, fill="yellow") +
          annotate("rect", xmin=-Inf, xmax=Inf, ymin=28, ymax=Inf, alpha=0.1, fill="green") +
          ggtitle("Per base sequence quality") + ylab("Quality score (Phred score) ") + xlab("Position of base in read")
    
       ## ggplot objects must be printed explicitly inside a loop to be drawn
       print(base.qual.p)
    
    }
    @
    
    \end{document}