I track various information related to water in California every day. The person in this role before me did it by manually keying in data from websites, and I've started using R to automate the process. So far things have gone well using SelectorGadget for pages like https://cdec.water.ca.gov/reportapp/javareports?name=RES. I've tried working step by step through different text-mining tutorials, but I'm still stuck on this task.
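For reference, the kind of thing that has worked so far with the SelectorGadget/rvest route looks roughly like this (a minimal sketch; the table index is an assumption and needs to be checked against the actual page):
library(rvest)
# read the report page and pull every HTML table it contains as a data frame
pg   <- read_html("https://cdec.water.ca.gov/reportapp/javareports?name=RES")
tbls <- html_table(pg)
res  <- tbls[[1]]   # assumption: the reservoir table is the first one -- verify with SelectorGadget
head(res)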
I've also tried converting it to a PDF and using PDF tools, but couldn't get what I was after.
Any help would be greatly appreciated.
Thanks
Ethan James W
Answer 0 (score: 1)
library(httr)
library(stringi)
# grab the raw Dispatchers Monday Water Report (a plain, fixed-width text file)
res <- httr::GET("https://water.ca.gov/-/media/DWR-Website/Web-Pages/Programs/State-Water-Project/Operations-And-Maintenance/Files/Operations-Control-Office/Project-Wide-Operations/Dispatchers-Monday-Water-Report.txt?la=en&hash=B8C874426999D484F7CF1E9821EE9D8C6896CF1E")
# split the response body into individual lines
l <- stri_split_lines(content(res))[[1]]
# page boundaries are marked by the "SUMMARY OF SWP" header lines
page_breaks <- which(stri_detect_fixed(l, "SUMMARY OF SWP"))
# target page 1
page_one <- l[1:(page_breaks[2]-1)]
# find all the records on the page
recs <- paste0(page_one[stri_detect_regex(page_one, "^[[:alpha:]].*[[:digit:]]\\.")], collapse="\n")
# read it in as a fixed-width text file (b/c it really kinda is)
read.fwf(
  textConnection(recs),
  widths = c(10, 7, 8, 7, 7, 8, 8, 5, 7, 6, 7),
  stringsAsFactors = FALSE
) -> xdf
# clean up the columns
xdf[] <- lapply(xdf, stri_trim_both)
xdf[] <- lapply(xdf, function(x) ifelse(grepl("\\.\\.|DCTOT", x), "NA", x)) # replace "....."s and the "DCTOT" string with "NA" so we can do the type conversion
xdf <- type.convert(xdf)
colnames(xdf) <- c("reservoir", "abs_max_elev", "abs_max_stor", "norm_min_elev", "norm_min_stor", "elev", "stor", "evap", "chng", "net_rel", "inflow")
xdf$reservoir <- as.character(xdf$reservoir)
Which gives us:
xdf
## reservoir abs_max_elev abs_max_stor norm_min_elev norm_min_stor elev stor evap chng net_rel inflow
## 1 FRENCHMN 5588.0 55475 5560.00 21472 5578.67 41922 NA -53 NA NA
## 2 ANTELOPE 5002.0 22564 4990.00 12971 4994.64 16306 NA -46 NA NA
## 3 DAVIS 5775.0 84371 5760.00 35675 5770.22 66299 NA -106 NA NA
## 4 OROVILLE 901.0 3553405 640.00 852196 702.69 1275280 249 -4792 6018 1475
## 5 F/B 225.0 11768 221.00 9350 224.52 11467 NA -106 NA NA
## 6 DIV 225.0 13353 221.00 12091 224.58 13217 NA -48 NA NA
## 7 F/B+DIV 225.0 25120 221.00 21441 NA 24684 NA -154 NA NA
## 8 AFTERBAY 136.0 54906 124.00 15156 132.73 41822 NA -263 5372 NA
## 9 CLIF CT 5.0 29082 -2.00 13965 -0.72 16714 NA 194 NA 5943
## 10 BETHANY 243.5 4894 241.50 4545 243.00 4806 NA 0 NA NA
## 11 DYER 806.0 545 785.00 90 795.40 299 NA -21 NA NA
## 12 DEL VALLE 703.0 39914 678.00 24777 690.22 31514 NA -122 97 0
## 13 TEHACHAPI 3101.0 545 3097.00 388 3098.22 434 NA -25 NA NA
## 14 TEHAC EAB 3101.0 1232 3085.00 254 3096.64 941 NA -39 NA NA
## 15 QUAIL+LQC 3324.5 8612 3306.50 3564 3318.18 6551 NA -10 0 NA
## 16 PYRAMID 2578.0 169901 2560.00 147680 2574.72 165701 25 -1056 881 0
## 17 ELDRBERRY 1530.0 27681 1490.00 12228 1510.74 19470 NA 805 0 0
## 18 CASTAIC 1513.0 319247 1310.00 33482 1491.48 273616 36 -1520 1432 0
## 19 SILVRWOOD 3355.0 74970 3312.00 39211 3351.41 71511 10 276 1582 107
## 20 DC AFBY 1 1933.0 50 1922.00 18 1932.64 49 NA 0 NA NA
## 21 DC AFBY 2 1930.0 967 1904.50 198 1922.01 696 NA 37 1690 NA
## 22 CRAFTON H 2925.0 292 2905.00 70 2923.60 274 NA -2 NA NA
## 23 PERRIS 1588.0 126841 1555.30 60633 1577.96 104620 21 85 8 NA
## 24 SAN LUIS 543.0 2027835 326.00 79231 470.16 1178789 238 3273 -4099 0
## 25 O'NEILL 224.5 55076 217.50 36843 222.50 49713 NA 2325 NA NA
## 26 LOS BANOS 353.5 34562 296.00 8315 322.87 18331 NA -5 0 0
## 27 L.PANOCHE 670.4 13233 590.00 308 599.60 664 NA 0 0 0
## 28 TRINITY 2370.0 2447656 2145.00 312631 2301.44 1479281 NA -1192 NA NA
## 29 SHASTA 1067.0 4552095 828.00 502004 974.01 2300953 NA -6238 NA NA
## 30 FOLSOM 466.0 976952 327.80 84649 408.50 438744 NA -2053 NA NA
## 31 MELONES 1088.0 2420000 808.00 300000 1031.66 1779744 NA -2370 NA NA
## 32 PINE FLT 951.5 1000000 712.58 100002 771.51 231361 NA 543 508 NA
## 33 MATHEWS 1390.0 182569 1253.80 3546 1352.17 94266 NA 522 NA NA
## 34 SKINNER 1479.0 44405 1393.00 0 1476.02 38485 NA 242 NA NA
## 35 BULLARDS 1956.0 966103 1730.00 230118 1869.01 604827 NA -1310 NA NA
That was easy :-)
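Since you're tracking this daily, you can also drop the cleaned result into a dated file so the history accumulates (the path here is just an example):
# write today's pull to a dated CSV; adjust the filename/path to taste
write.csv(xdf, sprintf("swp_report_page1_%s.csv", Sys.Date()), row.names = FALSE)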
Most of page 2 can be handled fairly simply as well:
page_two <- l[page_breaks[2]:length(l)]
do.call(
  rbind.data.frame,
  lapply(
    stri_split_fixed(
      stri_replace_all_regex(
        stri_trim_both(page_two[stri_detect_regex(
          stri_trim_both(page_two), # trim blanks
          "^([^[:digit:]]+)([[:digit:]\\.]+)[[:space:]]+([^[:digit:]]+)([[:digit:]\\.]+)$" # find the release rows
        )]),
        "[[:space:]]{2,}", "\t" # make tab-separated fields wherever there are 2+ space breaks
      ), "\t"),
    function(x) {
      if (length(x) > 2) { # one of the lines will only have one record but most have 2
        data.frame(
          facility = c(x[1], x[3]),
          amt = as.numeric(c(x[2], x[4])),
          stringsAsFactors = FALSE
        )
      } else {
        data.frame(
          facility = x[1],
          amt = as.numeric(x[2]),
          stringsAsFactors = FALSE
        )
      }
    })
) -> ydf
Which gives us (minus the not-so-useful TOTAL rows):
ydf[!grepl("TOTAL", ydf$facility),]
## facility amt
## 1 KESWICK RELEASE TO RIVER 15386.0
## 2 SHASTA STORAGE WITHDRAWAL 8067.0
## 3 SPRING CREEK RELEASE 0.0
## 4 WHISKYTOWN STORAGE WITHDRAWAL 46.0
## 6 OROVILLE STORAGE WITHDRAWL 5237.0
## 7 CDWR YUBA RIVER @ MARYSVILLE 0.0
## 8 FOLSOM STORAGE WITHDRAWAL 1386.0
## 9 LAKE OROVILLE 20.2
## 10 BYRON BETHANY I.D. 32.0
## 11 POWER CANAL 0.0
## 12 SAN LUIS TO SAN FELIPE 465.0
## 13 SUTTER BUTTE 922.0
## 14 O'NEILL FOREBAY 2.0
## 15 LATERAL 0.0
## 16 CASTAIC LAKE 1432.0
## 17 RICHVALE 589.0
## 18 SILVERWOOD LAKE TO CLAWA 7.0
## 19 WESTERN 787.0
## 20 LAKE PERRIS 0.0
## 23 D/S FEATHER R. DIVERSIONS 0.0
## 24 FISH REQUIREMENT 1230.0
## 25 FLOOD CONTROL RELEASE 0.0
## 26 DELTA REQUIREMENT 3629.0
## 27 FEATHER R. RELEASE @ RIVER OUTLET 3074.0
## 28 OTHER RELEASE 0.0
But you're on your own if you need the delta data or the plant operations data.
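If you do decide to tackle those, a possible starting point (the "DELTA" header string below is a guess, so check the actual section headings in the report) is to slice out that block the same way page one was isolated, then work out its fixed widths:
# hypothetical sketch: find a section header and print the lines that follow it
# so you can eyeball the layout before writing a read.fwf() call for it
start <- which(stri_detect_fixed(l, "DELTA"))[1]
if (!is.na(start)) {
  writeLines(l[start:min(start + 20, length(l))])
}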