整理格式奇怪的数据

时间:2018-03-26 15:31:35

标签: r

我拿到了一个格式奇怪的数据文件,读入 R 之后如下:

# Raw table as read from the file. Every cell is a character string that still
# contains its literal double quotes. Column X1 holds the row labels (ID,
# Parameter, Year, then the values 800..795 that the expected result calls
# "wavelength"); X2..X4 hold the data for IDs 001, 002 and 003.
# Single-quoted R strings are used so the embedded double quotes need no
# escaping; the column names come from the names of the list itself.
df <- structure(
  list(
    X1 = c(
      '"ID"', '"Parameter"', '"Year"',
      '"800"', '"799"', '"798"', '"797"', '"796"', '"795"'
    ),
    X2 = c(
      '"001"', '"ap"', '"2016"',
      '"-0.000978013465745161"', '"-0.000853574674147712"',
      '"-0.000770681767403833"', '"-0.000762506834996983"',
      '"-0.000963651463931642"', '"-0.000839241421637097"'
    ),
    X3 = c(
      '"002"', '"ap"', '"2016"',
      '"-0.000583552718375254"', '"-0.000495471744663315"',
      '"-0.000502488351223215"', '"-0.000589039598146738"',
      '"-0.000599887975678647"', '"-0.000471434015603837"'
    ),
    X4 = c(
      '"003"', '"ap"', '"2016"',
      '"-0.000568187733836333"', '"-0.000527059984394067"',
      '"-0.000615318057111717"', '"-0.000592280468265934"',
      '"-0.000577707032763554"', '"-0.000569167407032334"'
    )
  ),
  # Automatic (compact) row names for 9 rows.
  row.names = c(NA, -9L),
  class = c("tbl_df", "tbl", "data.frame")
)


df
#>            X1                      X2                      X3
#> 1        "ID"                   "001"                   "002"
#> 2 "Parameter"                    "ap"                    "ap"
#> 3      "Year"                  "2016"                  "2016"
#> 4       "800" "-0.000978013465745161" "-0.000583552718375254"
#> 5       "799" "-0.000853574674147712" "-0.000495471744663315"
#> 6       "798" "-0.000770681767403833" "-0.000502488351223215"
#> 7       "797" "-0.000762506834996983" "-0.000589039598146738"
#> 8       "796" "-0.000963651463931642" "-0.000599887975678647"
#> 9       "795" "-0.000839241421637097" "-0.000471434015603837"
#>                        X4
#> 1                   "003"
#> 2                    "ap"
#> 3                  "2016"
#> 4 "-0.000568187733836333"
#> 5 "-0.000527059984394067"
#> 6 "-0.000615318057111717"
#> 7 "-0.000592280468265934"
#> 8 "-0.000577707032763554"
#> 9 "-0.000569167407032334"

我写了一些代码来整理这些数据,但代码不够优雅。预期输出如下。

# Expected tidy result: one row per (id, wavelength) pair, i.e. 3 ids x 6
# wavelengths = 18 rows, ordered by id and then by wavelength as listed in the
# raw table. All columns are character and the values still carry the literal
# double quotes from the raw data.
res <- data.frame(
  # The six wavelengths repeat once per id.
  wavelength = rep(
    c('"800"', '"799"', '"798"', '"797"', '"796"', '"795"'),
    times = 3
  ),
  # Each id spans six consecutive rows.
  id = rep(c('"001"', '"002"', '"003"'), each = 6),
  parameter = rep('"ap"', times = 18),
  year = rep('"2016"', times = 18),
  value = c(
    # id 001
    '"-0.000978013465745161"', '"-0.000853574674147712"',
    '"-0.000770681767403833"', '"-0.000762506834996983"',
    '"-0.000963651463931642"', '"-0.000839241421637097"',
    # id 002
    '"-0.000583552718375254"', '"-0.000495471744663315"',
    '"-0.000502488351223215"', '"-0.000589039598146738"',
    '"-0.000599887975678647"', '"-0.000471434015603837"',
    # id 003
    '"-0.000568187733836333"', '"-0.000527059984394067"',
    '"-0.000615318057111717"', '"-0.000592280468265934"',
    '"-0.000577707032763554"', '"-0.000569167407032334"'
  ),
  stringsAsFactors = FALSE
)

res
#>    wavelength    id parameter   year                   value
#> 1       "800" "001"      "ap" "2016" "-0.000978013465745161"
#> 2       "799" "001"      "ap" "2016" "-0.000853574674147712"
#> 3       "798" "001"      "ap" "2016" "-0.000770681767403833"
#> 4       "797" "001"      "ap" "2016" "-0.000762506834996983"
#> 5       "796" "001"      "ap" "2016" "-0.000963651463931642"
#> 6       "795" "001"      "ap" "2016" "-0.000839241421637097"
#> 7       "800" "002"      "ap" "2016" "-0.000583552718375254"
#> 8       "799" "002"      "ap" "2016" "-0.000495471744663315"
#> 9       "798" "002"      "ap" "2016" "-0.000502488351223215"
#> 10      "797" "002"      "ap" "2016" "-0.000589039598146738"
#> 11      "796" "002"      "ap" "2016" "-0.000599887975678647"
#> 12      "795" "002"      "ap" "2016" "-0.000471434015603837"
#> 13      "800" "003"      "ap" "2016" "-0.000568187733836333"
#> 14      "799" "003"      "ap" "2016" "-0.000527059984394067"
#> 15      "798" "003"      "ap" "2016" "-0.000615318057111717"
#> 16      "797" "003"      "ap" "2016" "-0.000592280468265934"
#> 17      "796" "003"      "ap" "2016" "-0.000577707032763554"
#> 18      "795" "003"      "ap" "2016" "-0.000569167407032334"

如何快速地把数据整理(gather)成这种长格式(使用基础 R、tidyr 或 data.table)?

2 个答案:

答案 0 :(得分:2)

方式:

library(tidyverse)

# Transpose the data columns (everything except the label column X1) so that
# each ID becomes one row, and use the de-quoted X1 labels as column names.
# The matrix is named before conversion so as_tibble() never sees an unnamed
# matrix; as_tibble() replaces the deprecated as.tibble().
wide <- t(df[-1])
colnames(wide) <- str_remove_all(df[[1]], '"')
test <- as_tibble(wide)

# Strip the literal double quotes from every cell, then convert every column
# except Parameter and Year to numeric. Selecting by name instead of by
# position (previously columns 1 and 4:9) keeps this working for any number of
# wavelength columns. across() replaces the deprecated funs() helpers.
# NOTE: as.numeric() turns the zero-padded ID "001" into 1; exclude ID from
# the conversion if the padding has to be preserved.
test2 <- test %>%
  mutate(across(everything(), ~ str_remove_all(.x, '"'))) %>%
  mutate(across(-c(Parameter, Year), as.numeric))

# Reshape to long format: one row per ID / wavelength pair, with the former
# column names converted to a numeric Wavelength column. pivot_longer()
# supersedes gather(); the final arrange() reproduces the row order that
# gather() gave for this data (wavelength descending, then ID).
answer <- test2 %>%
  pivot_longer(
    cols = -c(ID, Parameter, Year),
    names_to = "Wavelength",
    names_transform = list(Wavelength = as.numeric),
    values_to = "value"
  ) %>%
  arrange(desc(Wavelength), ID)

**输出:**

 > answer
# A tibble: 18 x 5
      ID Parameter Year  Wavelength     value
   <dbl> <chr>     <chr>      <dbl>     <dbl>
 1    1. ap        2016        800. -0.000978
 2    2. ap        2016        800. -0.000584
 3    3. ap        2016        800. -0.000568
 4    1. ap        2016        799. -0.000854
 5    2. ap        2016        799. -0.000495
 6    3. ap        2016        799. -0.000527
 7    1. ap        2016        798. -0.000771
 8    2. ap        2016        798. -0.000502
 9    3. ap        2016        798. -0.000615
10    1. ap        2016        797. -0.000763
11    2. ap        2016        797. -0.000589
12    3. ap        2016        797. -0.000592
13    1. ap        2016        796. -0.000964
14    2. ap        2016        796. -0.000600
15    3. ap        2016        796. -0.000578
16    1. ap        2016        795. -0.000839
17    2. ap        2016        795. -0.000471
18    3. ap        2016        795. -0.000569

答案 1 :(得分:1)

借助 data.table,您可以执行以下操作:

  # Transpose df so each original column becomes one line of text, paste the
  # cells of each line together, and let read.table() re-parse those lines
  # (this also strips the embedded quotes and uses the first line as header).
  # The parsed table is then melted to long format with the first three
  # columns (ID, Parameter, Year) as identifiers.
  # Arguments are spelled out in full (header = TRUE, variable.name) instead of
  # relying on partial matching and `T`; functions are namespace-qualified so
  # purrr::transpose() cannot be picked up by mistake, and the data.frame is
  # converted with setDT() because data.table's melt() method is defined for
  # data.tables.
  # NOTE: read.table() keeps check.names = TRUE here, so the wavelength labels
  # come out as X800, X799, ...; pass check.names = FALSE to keep them as-is.
  data.table::melt(
    data.table::setDT(
      read.table(
        header = TRUE,
        text = do.call(paste, data.table::transpose(df))
      )
    ),
    id.vars = 1:3,
    variable.name = "wavelength"
  )
   ID Parameter Year wavelength         value
1   1        ap 2016       X800 -0.0009780135
2   2        ap 2016       X800 -0.0005835527
3   3        ap 2016       X800 -0.0005681877
4   1        ap 2016       X799 -0.0008535747
5   2        ap 2016       X799 -0.0004954717
6   3        ap 2016       X799 -0.0005270600
7   1        ap 2016       X798 -0.0007706818
8   2        ap 2016       X798 -0.0005024884
9   3        ap 2016       X798 -0.0006153181
10  1        ap 2016       X797 -0.0007625068
11  2        ap 2016       X797 -0.0005890396
12  3        ap 2016       X797 -0.0005922805
13  1        ap 2016       X796 -0.0009636515
14  2        ap 2016       X796 -0.0005998880
15  3        ap 2016       X796 -0.0005777070
16  1        ap 2016       X795 -0.0008392414
17  2        ap 2016       X795 -0.0004714340
18  3        ap 2016       X795 -0.0005691674