将文本/PDF文件中的信息收集到R

时间:2018-10-19 18:24:09

标签: r pdf text web-scraping mining

我每天跟踪与加利福尼亚州的水有关的各种信息。我之前的人是通过手动输入来自网站的数据来完成此操作的。我已经开始使用R来自动执行此过程。到目前为止,对于https://cdec.water.ca.gov/reportapp/javareports?name=RES

这样的页面使用选择器小工具进展顺利

但是,由于该报告是纯文本格式,因此我在处理此报告时遇到了麻烦: https://water.ca.gov/-/media/DWR-Website/Web-Pages/Programs/State-Water-Project/Operations-And-Maintenance/Files/Operations-Control-Office/Project-Wide-Operations/Dispatchers-Monday-Water-Report.txt?la=en&hash=B8C874426999D484F7CF1E9821EE9D8C6896CF1E

我尝试逐步遵循不同的文本挖掘教程,但仍然对此任务感到困惑。

我还尝试过将其转换为pdf并使用pdf工具,但未能实现我的目标。

任何帮助将不胜感激。

谢谢

Ethan James W

1 个答案:

答案 0 :(得分:1)

library(httr)
library(stringi)

# Fetch the raw dispatcher report. Requesting the body explicitly as text
# (with a fixed encoding) avoids httr's content-type guessing, which can
# otherwise return raw bytes or a parsed object depending on the server's
# Content-Type header.
res <- httr::GET("https://water.ca.gov/-/media/DWR-Website/Web-Pages/Programs/State-Water-Project/Operations-And-Maintenance/Files/Operations-Control-Office/Project-Wide-Operations/Dispatchers-Monday-Water-Report.txt?la=en&hash=B8C874426999D484F7CF1E9821EE9D8C6896CF1E")

# Split the single text blob into one character element per line
l <- stri_split_lines(content(res, as = "text", encoding = "UTF-8"))[[1]]

# The "SUMMARY OF SWP" header appears once per report page, so its
# positions mark the page boundaries.
page_breaks <- which(stri_detect_fixed(l, "SUMMARY OF SWP"))

# target page 1: runs from the top of the file to just before the second
# "SUMMARY OF SWP" header
page_one <- l[1:(page_breaks[2]-1)]

# Keep only data rows: lines that start with a letter (a reservoir name)
# and contain a decimal number further along.
recs <- paste0(page_one[stri_detect_regex(page_one, "^[[:alpha:]].*[[:digit:]]\\.")], collapse="\n")

# read it in as a fixed-width text file (b/c it really kinda is)
read.fwf(
  textConnection(recs),
  widths = c(10, 7, 8, 7, 7, 8, 8, 5, 7, 6, 7),
  stringsAsFactors = FALSE
) -> xdf

# clean up the columns
xdf[] <- lapply(xdf, stri_trim_both)
# replace "....." filler and the "DCTOT" label with "NA" so the numeric
# columns coerce cleanly during type conversion
xdf[] <- lapply(xdf, function(x) ifelse(grepl("\\.\\.|DCTOT", x), "NA", x))
# as.is = TRUE keeps character columns as character instead of factors,
# and avoids the R >= 3.6 warning about an unspecified `as.is`
xdf <- type.convert(xdf, as.is = TRUE)
colnames(xdf) <- c("reservoir", "abs_max_elev", "abs_max_stor", "norm_min_elev", "norm_min_stor", "elev", "stor", "evap", "chng", "net_rel", "inflow")
xdf$reservoir <- as.character(xdf$reservoir)  # no-op with as.is = TRUE; kept defensively

哪个给了我们

# Display the parsed reservoir table
xdf
##    reservoir abs_max_elev abs_max_stor norm_min_elev norm_min_stor    elev    stor evap  chng net_rel inflow
## 1   FRENCHMN       5588.0        55475       5560.00         21472 5578.67   41922   NA   -53      NA     NA
## 2   ANTELOPE       5002.0        22564       4990.00         12971 4994.64   16306   NA   -46      NA     NA
## 3      DAVIS       5775.0        84371       5760.00         35675 5770.22   66299   NA  -106      NA     NA
## 4   OROVILLE        901.0      3553405        640.00        852196  702.69 1275280  249 -4792    6018   1475
## 5        F/B        225.0        11768        221.00          9350  224.52   11467   NA  -106      NA     NA
## 6        DIV        225.0        13353        221.00         12091  224.58   13217   NA   -48      NA     NA
## 7    F/B+DIV        225.0        25120        221.00         21441      NA   24684   NA  -154      NA     NA
## 8   AFTERBAY        136.0        54906        124.00         15156  132.73   41822   NA  -263    5372     NA
## 9    CLIF CT          5.0        29082         -2.00         13965   -0.72   16714   NA   194      NA   5943
## 10   BETHANY        243.5         4894        241.50          4545  243.00    4806   NA     0      NA     NA
## 11      DYER        806.0          545        785.00            90  795.40     299   NA   -21      NA     NA
## 12 DEL VALLE        703.0        39914        678.00         24777  690.22   31514   NA  -122      97      0
## 13 TEHACHAPI       3101.0          545       3097.00           388 3098.22     434   NA   -25      NA     NA
## 14 TEHAC EAB       3101.0         1232       3085.00           254 3096.64     941   NA   -39      NA     NA
## 15 QUAIL+LQC       3324.5         8612       3306.50          3564 3318.18    6551   NA   -10       0     NA
## 16   PYRAMID       2578.0       169901       2560.00        147680 2574.72  165701   25 -1056     881      0
## 17 ELDRBERRY       1530.0        27681       1490.00         12228 1510.74   19470   NA   805       0      0
## 18   CASTAIC       1513.0       319247       1310.00         33482 1491.48  273616   36 -1520    1432      0
## 19 SILVRWOOD       3355.0        74970       3312.00         39211 3351.41   71511   10   276    1582    107
## 20 DC AFBY 1       1933.0           50       1922.00            18 1932.64      49   NA     0      NA     NA
## 21 DC AFBY 2       1930.0          967       1904.50           198 1922.01     696   NA    37    1690     NA
## 22 CRAFTON H       2925.0          292       2905.00            70 2923.60     274   NA    -2      NA     NA
## 23    PERRIS       1588.0       126841       1555.30         60633 1577.96  104620   21    85       8     NA
## 24  SAN LUIS        543.0      2027835        326.00         79231  470.16 1178789  238  3273   -4099      0
## 25   O'NEILL        224.5        55076        217.50         36843  222.50   49713   NA  2325      NA     NA
## 26 LOS BANOS        353.5        34562        296.00          8315  322.87   18331   NA    -5       0      0
## 27 L.PANOCHE        670.4        13233        590.00           308  599.60     664   NA     0       0      0
## 28   TRINITY       2370.0      2447656       2145.00        312631 2301.44 1479281   NA -1192      NA     NA
## 29    SHASTA       1067.0      4552095        828.00        502004  974.01 2300953   NA -6238      NA     NA
## 30    FOLSOM        466.0       976952        327.80         84649  408.50  438744   NA -2053      NA     NA
## 31   MELONES       1088.0      2420000        808.00        300000 1031.66 1779744   NA -2370      NA     NA
## 32  PINE FLT        951.5      1000000        712.58        100002  771.51  231361   NA   543     508     NA
## 33   MATHEWS       1390.0       182569       1253.80          3546 1352.17   94266   NA   522      NA     NA
## 34   SKINNER       1479.0        44405       1393.00             0 1476.02   38485   NA   242      NA     NA
## 35  BULLARDS       1956.0       966103       1730.00        230118 1869.01  604827   NA -1310      NA     NA

那很容易:-)

第2页的大部分内容都可以非常简单地完成:

# Page 2 is everything from the second "SUMMARY OF SWP" header to the end
page_two <- l[page_breaks[2]:length(l)]

# A release row looks like "NAME 123.0   NAME 456.0": text, a number, and
# (usually) a second text/number pair, with nothing trailing.
release_pat <- "^([^[:digit:]]+)([[:digit:]\\.]+)[[:space:]]+([^[:digit:]]+)([[:digit:]\\.]+)$"

# Trim blanks, then keep only the lines matching the release-row shape
trimmed <- stri_trim_both(page_two)
release_rows <- trimmed[stri_detect_regex(trimmed, release_pat)]

# Make tab-separated fields wherever there are 2+ space breaks, then split
fields_per_row <- stri_split_fixed(
  stri_replace_all_regex(release_rows, "[[:space:]]{2,}", "\t"),
  "\t"
)

# Turn one split row into a small data frame. Most rows carry two
# facility/amount pairs; one row carries only a single pair.
row_to_df <- function(fields) {
  if (length(fields) > 2) {
    data.frame(
      facility = c(fields[1], fields[3]),
      amt = as.numeric(c(fields[2], fields[4])),
      stringsAsFactors = FALSE
    )
  } else {
    data.frame(
      facility = fields[1],
      amt = as.numeric(fields[2]),
      stringsAsFactors = FALSE
    )
  }
}

# Stack the per-row frames into one long facility/amount table
ydf <- do.call(rbind.data.frame, lapply(fields_per_row, row_to_df))

哪个给了我们(没有附近的无用TOTAL行):

# Show the releases, dropping the aggregate TOTAL rows
subset(ydf, !grepl("TOTAL", facility, fixed = TRUE))
##                             facility     amt
## 1           KESWICK RELEASE TO RIVER 15386.0
## 2          SHASTA STORAGE WITHDRAWAL  8067.0
## 3               SPRING CREEK RELEASE     0.0
## 4      WHISKYTOWN STORAGE WITHDRAWAL    46.0
## 6         OROVILLE STORAGE WITHDRAWL  5237.0
## 7       CDWR YUBA RIVER @ MARYSVILLE     0.0
## 8          FOLSOM STORAGE WITHDRAWAL  1386.0
## 9                      LAKE OROVILLE    20.2
## 10                BYRON BETHANY I.D.    32.0
## 11                       POWER CANAL     0.0
## 12            SAN LUIS TO SAN FELIPE   465.0
## 13                      SUTTER BUTTE   922.0
## 14                   O'NEILL FOREBAY     2.0
## 15                           LATERAL     0.0
## 16                      CASTAIC LAKE  1432.0
## 17                          RICHVALE   589.0
## 18          SILVERWOOD LAKE TO CLAWA     7.0
## 19                           WESTERN   787.0
## 20                       LAKE PERRIS     0.0
## 23         D/S FEATHER R. DIVERSIONS     0.0
## 24                  FISH REQUIREMENT  1230.0
## 25             FLOOD CONTROL RELEASE     0.0
## 26                 DELTA REQUIREMENT  3629.0
## 27 FEATHER R. RELEASE @ RIVER OUTLET  3074.0
## 28                     OTHER RELEASE     0.0

但是,如果您需要增量数据或电厂运营数据,则需要您自行处理了。