如何让用户选择数据文件中要分析的记录范围?

时间:2010-09-09 09:51:49

标签: xml r

我有以下XML文件:

<Company >
    <shareprice>
        <timeStamp> 12:00:00.01</timeStamp>
        <Price>  25.02</Price>
    </shareprice>

    <shareprice>
        <timeStamp> 12:00:00.02</timeStamp>
        <Price>  15</Price>
    </shareprice>

    <shareprice>
        <timeStamp> 12:00:01.025</timeStamp>
        <Price>  15.02</Price>
    </shareprice>

    <shareprice>
        <timeStamp> 12:00:01.031</timeStamp>
        <Price>  18.25</Price>
    </shareprice>

    <shareprice>
        <timeStamp> 12:00:01.039</timeStamp>
        <Price>  18.54</Price>
    </shareprice>

    <shareprice>
        <timeStamp> 12:00:01.050</timeStamp>
        <Price> 16.52</Price>
    </shareprice>

    <shareprice>
        <timeStamp> 12:00:02.01</timeStamp>
        <Price>  17.50</Price>
    </shareprice>

    <shareprice>
        <timeStamp> 12:00:03.01</timeStamp>
        <Price>  25.02</Price>
    </shareprice>

    <shareprice>
        <timeStamp> 12:00:05.02</timeStamp>
        <Price>  30</Price>
    </shareprice>

    <shareprice>
        <timeStamp> 12:00:11.025</timeStamp>
        <Price>  32.25</Price>
    </shareprice>

    <shareprice>
        <timeStamp> 12:00:12.031</timeStamp>
        <Price>  26.05</Price>
    </shareprice>

    <shareprice>
        <timeStamp> 12:00:15.039</timeStamp>
        <Price>  18.54</Price>
    </shareprice>

    <shareprice>
        <timeStamp> 12:00:19.050</timeStamp>
        <Price> 16.52</Price>
    </shareprice>

    <shareprice>
        <timeStamp> 12:01:02.01</timeStamp>
        <Price>  17.50</Price>
    </shareprice>
</Company>

我还有以下R代码:

library (ggplot2)
library (XML)
# Let the user pick the input XML file interactively and load its records
# into a data frame.
df <- xmlToDataFrame(file.choose())

# Convert both columns explicitly (via as.character, in case they were read
# as factors). The time stamps are stored as POSIXct: POSIXlt, which is what
# strptime() returns, is a list-based class and awkward as a data-frame column.
df$timeStamp <- as.POSIXct(
  strptime(as.character(df$timeStamp), "%H:%M:%OS")
)
df$Price <- as.numeric(as.character(df$Price))
sapply(df, class)
options(digits.secs = 3) # print time stamps with millisecond precision
summary(df)

# Replace the parsed time stamps with synthetic ones: start at the first
# record's time and advance by a random gap of up to 60 seconds per record.
# runif(nrow(df)) is used instead of runif(1:length(...)) because 1:0 has
# length two, which would draw two values for an empty file.
df$timeStamp <- df[1, "timeStamp"] + cumsum(runif(nrow(df)) * 60)
summary(df)

# Price change relative to the previous record. The first record has no
# predecessor, so its difference is reported as 0. (Named price_diff so the
# base function diff() is not masked.)
price_diff <- c(0, diff(df$Price))

# Summary statistics of the price column.
summary(df$Price)
price_min <- min(df$Price)
price_max <- max(df$Price)
price_median <- median(df$Price)
price_mean <- mean(df$Price)
price_sd <- sd(df$Price)

# ---- Chart data file ----
# Divert console output into the chart file, write it with cat(), then call
# sink() with no arguments to end the diversion and close the file.
sink(file = "c:/xampp/htdocs/Sharedata.xml", type = "output", split = FALSE)
# NOTE(review): yAxisName is "Voltage" although the values are share prices;
# left unchanged because consumers of the file may rely on it - confirm.
cat("<graph caption=\"Share Data Wave\" subcaption=\"For Person's Name\"   xAxisName=\"Time\" yAxisMinValue=\"-0.025\" yAxisName=\"Voltage\" decimalPrecision=\"5\"  formatNumberScale=\"0\" numberPrefix=\"\" showNames=\"1\" showValues=\"0\" showAlternateHGridColor=\"1\" AlternateHGridColor=\"ff5904\" divLineColor=\"ff5904\" divLineAlpha=\"20\" alternateHGridAlpha=\"5\">\n")
# sprintf() is vectorised, so this emits one <set> element per record.
cat(sprintf("    <set name=\"%s\" value=\"%f\" hoverText = \"The difference from last value: %s\" ></set>\n", df$timeStamp, df$Price, price_diff))
cat("</graph>\n")
sink()
# NOTE(review): this deletes a file named data.xml in the working directory,
# which this script never creates. Kept for backward compatibility - confirm
# whether it is still needed.
unlink("data.xml")

# ---- Statistics file ----
sink(file = "c:/xampp/htdocs/Sharesstatistics.xml", type = "output", split = FALSE)
cat("  <statistics>\n")
cat(sprintf("    <mean>%s</mean>\n", price_mean))
cat(sprintf("    <sd>%s</sd>\n", price_sd))
cat(sprintf("    <min>%s</min>\n", price_min))
cat(sprintf("    <median>%s</median>\n", price_median))
cat(sprintf("    <max>%s</max>\n", price_max))
cat("  </statistics>\n")
sink()
# NOTE(review): same as above - statistics.xml is not written by this script.
unlink("statistics.xml")
quit()

这段R代码对整个文件的处理完全符合我的要求。我的问题是:如何让用户只选择输入文件中的一段记录进行分析,而不是分析整个文件?例如,用户只想分析XML文件中的第2到第5条记录,同时保持cat语句所定义的输出格式不变。

<shareprice>
        <timeStamp> 12:00:00.02</timeStamp>
        <Price>  15</Price>
    </shareprice>

    <shareprice>
        <timeStamp> 12:00:01.025</timeStamp>
        <Price>  15.02</Price>
    </shareprice>

    <shareprice>
        <timeStamp> 12:00:01.031</timeStamp>
        <Price>  18.25</Price>
    </shareprice>

非常感谢大家的帮助。

此致

安东尼。

1 个答案:

答案 0 :(得分:1)

只需先读入数据框,然后用例如 scan(n=2) 让用户输入记录范围的下限和上限,就可以轻松解决这个问题。另见 ?scan。它允许以交互方式提供输入,因此用户可以自行选择要处理的内容。这里的情形就是输入要使用的数据范围。

# Ask the user for the two bounds of the record range; they may be typed in
# either order.
x <- scan(n = 2)
stopifnot(length(x) == 2L, !anyNA(x))

# Keep only row numbers that actually exist in df and lie within the bounds.
# Unlike min(x):max(x), a bound below 1 or above nrow(df) can then neither
# produce all-NA rows nor trigger a mixed-sign index error.
id <- seq_len(nrow(df))
id <- id[id >= min(x) & id <= max(x)]

df2 <- df[id, , drop = FALSE]

如果您只想从一个非常大的XML文件中读取所需的记录,那就是另一回事了。我想不出有哪个内置函数可以做到这一点,所以您需要按如下方式自己实现:

# Read a subset of the records of an XML file into a data frame.
#
# Records are delimited by empty lines: record k is every non-empty line
# between the (k-1)th and the k-th empty line of the file. Lines that are
# not separated from a record by an empty line (for example a root
# element's opening or closing tag) therefore count as part of that record.
#
# Arguments:
#   x    path of the XML file to read.
#   n    vector of the record numbers wanted; a single number k is
#        shorthand for the first k records (seq_len(k)).
#   ...  currently unused.
#
# Returns the data frame built by xmlToDataFrame() from the selected
# records wrapped in a <Data> root element.
#
# Note: the dotted name makes this look like an S3 method of subset() for
# class "xml"; the name is kept so existing callers continue to work.
subset.xml <- function(x, n, ...) {
  # set a range if n is just a number
  if (length(n) == 1) n <- seq_len(n)

  # Read the file once instead of re-opening and re-skipping it per line.
  lines <- readLines(x, warn = FALSE)

  # Every empty line closes the current record, so a line's record number
  # is one more than the number of empty lines that precede it.
  is_blank <- lines == ""
  record <- cumsum(is_blank) + 1L
  keep <- !is_blank & record %in% n

  # Join with collapse so there is no leading separator to strip; stripping
  # a fixed-width prefix is what used to cut off the first character of the
  # first selected line.
  out <- paste(lines[keep], collapse = "\n")
  out <- paste("<Data>", out, "</Data>", sep = "\n")

  # asText = TRUE: `out` holds XML content, not a file name.
  xmlToDataFrame(xmlParse(out, asText = TRUE))
}

# Example: read only records 2 to 4 of test.xml.
df <- subset.xml("test.xml",2:4)
> df
      timeStamp   Price
1   12:00:00.02      15
2  12:00:01.025   15.02
3  12:00:01.031   18.25