我需要求这些值的平均值。class(newtemp) 是 character。这些数据原本是 "-10.6 °C" 这样的形式。我删除了 °C,但末尾的空格还在,因此 as.numeric 不起作用。
newtemp
[1] "-10.6 " "-10.6 " "-11.0 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 "
[12] "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-11.0 "
[23] "-10.6 " "-10.6 " "-10.6 " "-11.0 " "-10.6 " "-10.0 " "-10.0 " "-10.0 " "-10.0 " "-10.0 " "-9.4 "
[34] "-8.9 " "-8.3 " "-7.2 " "-7.2 " "-5.6 " "-5.0 " "-3.9 " "-3.9 " "-3.9 " "-3.3 " "-3.3 "
[45] "-3.9 " "-6.1 " "-8.3 " "-7.8 " "-8.9 " "-10.0 " "-11.7 " "-12.8 "
# Attempt 1: remove plain ASCII spaces with stringr.
library(stringr)
# Result: not able to remove the trailing whitespace.
try <- str_replace_all(newtemp, fixed(" "), "")
# Attempt 2: trim leading/trailing whitespace with a regex helper.
trim <- function(x) gsub("^\\s+|\\s+$", "", x)
trim(x = newtemp)
# Still not removed.
as.numeric(try)
[1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[36] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
#Warning message:
#NAs introduced by coercion
#Still no output.
我使用的代码是:
library(rvest)
linkurl <- "https://www.wunderground.com/history/airport/KVAY/2015/2/17/DailyHistory.html?req_city=Cherry+Hill&req_state=NJ&req_statename=New+Jersey&reqdb.zip=08002&reqdb.magic=1&reqdb.wmo=99999&MR=1"
# Download the page and read the node with id "obsTable" as a table.
weathertable <- linkurl %>%
  read_html() %>%
  html_node("#obsTable") %>%
  html_table()
weathertable
newtemp <- weathertable$Temp.
# Drop the unit suffix from every reading.
abc <- gsub("°C", "", newtemp)
abc
# Try stripping whitespace with the POSIX [[:space:]] class, then convert.
abc_new <- gsub("[[:space:]]", "", abc)
as.numeric(abc_new)
# Try trimws() instead, then convert.
trimws(abc)
as.numeric(trimws(abc))
更新
> newtemp
[1] "-10.6 °C" "-10.6 °C" "-11.0 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C"
[10] "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C"
[19] "-10.6 °C" "-10.6 °C" "-10.6 °C" "-11.0 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-11.0 °C" "-10.6 °C"
[28] "-10.0 °C" "-10.0 °C" "-10.0 °C" "-10.0 °C" "-10.0 °C" "-9.4 °C" "-8.9 °C" "-8.3 °C" "-7.2 °C"
[37] "-7.2 °C" "-5.6 °C" "-5.0 °C" "-3.9 °C" "-3.9 °C" "-3.9 °C" "-3.3 °C" "-3.3 °C" "-3.9 °C"
[46] "-6.1 °C" "-8.3 °C" "-7.8 °C" "-8.9 °C" "-10.0 °C" "-11.7 °C" "-12.8 °C"
> abc=(gsub(" °C", "", newtemp))
> abc
[1] "-10.6 °C" "-10.6 °C" "-11.0 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C"
[10] "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C"
[19] "-10.6 °C" "-10.6 °C" "-10.6 °C" "-11.0 °C" "-10.6 °C" "-10.6 °C" "-10.6 °C" "-11.0 °C" "-10.6 °C"
[28] "-10.0 °C" "-10.0 °C" "-10.0 °C" "-10.0 °C" "-10.0 °C" "-9.4 °C" "-8.9 °C" "-8.3 °C" "-7.2 °C"
[37] "-7.2 °C" "-5.6 °C" "-5.0 °C" "-3.9 °C" "-3.9 °C" "-3.9 °C" "-3.3 °C" "-3.3 °C" "-3.9 °C"
[46] "-6.1 °C" "-8.3 °C" "-7.8 °C" "-8.9 °C" "-10.0 °C" "-11.7 °C" "-12.8 °C"
> abc=(gsub("°C", "", newtemp))
> abc
[1] "-10.6 " "-10.6 " "-11.0 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 "
[12] "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-10.6 " "-11.0 "
[23] "-10.6 " "-10.6 " "-10.6 " "-11.0 " "-10.6 " "-10.0 " "-10.0 " "-10.0 " "-10.0 " "-10.0 " "-9.4 "
[34] "-8.9 " "-8.3 " "-7.2 " "-7.2 " "-5.6 " "-5.0 " "-3.9 " "-3.9 " "-3.9 " "-3.3 " "-3.3 "
[45] "-3.9 " "-6.1 " "-8.3 " "-7.8 " "-8.9 " "-10.0 " "-11.7 " "-12.8 "
>
答案 0 :(得分:2)
您可以使用函数 trimws。
> x <- "-10.6 "
> trimws(x)
[1] "-10.6"
> as.numeric(trimws(x))
[1] -10.6
更新
这似乎适用于您的情况。
# Keep only the leading number and drop everything after it (whitespace of
# any kind and the unit). The minus sign and the decimal part are optional,
# so non-negative and integer readings are extracted as well instead of
# being returned unchanged with the unit still attached.
abc <- gsub("(^[-]?\\d+(\\.\\d+)?)(.*$)", "\\1", newtemp)
# Show the extracted values next to the original strings for comparison.
data.frame(new = abc, old = newtemp)
new old
1 -10.6 -10.6 °C
2 -10.6 -10.6 °C
3 -11.0 -11.0 °C
4 -10.6 -10.6 °C
5 -10.6 -10.6 °C
6 -10.6 -10.6 °C
7 -10.6 -10.6 °C
8 -10.6 -10.6 °C
9 -10.6 -10.6 °C
10 -10.6 -10.6 °C
11 -10.6 -10.6 °C
12 -10.6 -10.6 °C
13 -10.6 -10.6 °C
14 -10.6 -10.6 °C
...
字符"转换"过程中出了些问题,这超出了我的理解范围,所以我无法解释到底哪里出错了。我解决问题的方法是忽略看似有问题的 °C 部分,只用正则表达式(regular expression)提取数字部分。简而言之,我使用特殊字符来匹配减号([-] 匹配减号)、数字(\d+ 匹配连续的一串数字)、小数点(\.)和字符串结尾($),用 () 把各部分放入分组,然后只提取第一个分组,把其余内容全部"留在后面"。
以下是原始字节(raw bytes):
> charToRaw(newtemp[1])
[1] 2d 31 30 2e 36 c2 a0 c2 b0 43
如果我将其复制/粘贴到 R 中:
> charToRaw("-10.6 °C")
[1] 2d 31 30 2e 36 20 b0 43
也许拥有更多计算机技能的人可以参与其中。
答案 1 :(得分:2)
您的字符串包含常规 ASCII 空格(十进制值 32)以外的空白字符。因此,您需要一个能匹配任何 Unicode 空白字符的正则表达式。奇怪的是,简单的 gsub("[[:space:]]*°C", "", newtemp) 并非在所有 R 环境中都有效。
通常有效的是PCRE正则表达式:
# (*UCP) makes \s Unicode-aware; perl=TRUE selects the PCRE engine over TRE.
gsub("(*UCP)\\s*°C", "", newtemp, perl=TRUE)
这里,(*UCP) 是一个 PCRE 动词(verb),它使简写字符类支持 Unicode,从而让 \s 可以匹配任何 Unicode 空白字符。perl=TRUE 参数让 R 使用 PCRE 正则表达式引擎,而不是默认的 TRE 正则表达式引擎。
答案 2 :(得分:1)
您可以使用readr::parse_number
来避免整个问题:
library(rvest)
library(tidyverse)
url <- "https://www.wunderground.com/history/airport/KVAY/2015/2/17/DailyHistory.html?req_city=Cherry+Hill&req_state=NJ&req_statename=New+Jersey&reqdb.zip=08002&reqdb.magic=1&reqdb.wmo=99999&MR=1"
# Download and parse the page.
h <- url %>% read_html()
# Read the node with id "obsTable" as a table.
obs <- h %>%
  html_node("#obsTable") %>%
  html_table()
# parse_number() pulls the number out of each Temp. string, so the unit and
# the surrounding whitespace never need to be stripped by hand.
obs_clean <- obs %>%
  mutate(Temp. = parse_number(Temp.)) %>%
  as_tibble() # for printing; replaces the deprecated tbl_df()
obs_clean
#> # A tibble: 52 x 13
#> `Time (EST)` Temp. Windchill `Dew Point` Humidity Pressure Visibility
#> <chr> <dbl> <chr> <chr> <chr> <chr> <chr>
#> 1 12:33 AM 12.9 - 8.1 °F 81% 30.08 in 2.0 mi
#> 2 12:45 AM 12.9 - 8.1 °F 81% 30.05 in 1.2 mi
#> 3 12:51 AM 12.2 - 8.6 °F 85% 30.05 in 0.8 mi
#> 4 12:54 AM 12.9 - 9.0 °F 84% 30.06 in 0.8 mi
#> 5 1:02 AM 12.9 - 9.0 °F 84% 30.05 in 0.5 mi
#> 6 1:25 AM 12.9 - 9.0 °F 84% 30.02 in 1.0 mi
#> 7 1:37 AM 12.9 6.9 °F 9.0 °F 84% 30.03 in 0.8 mi
#> 8 1:54 AM 12.9 6.9 °F 9.0 °F 84% 30.02 in 0.8 mi
#> 9 2:18 AM 12.9 5.2 °F 9.0 °F 84% 30.01 in 1.0 mi
#> 10 2:40 AM 12.9 5.2 °F 9.0 °F 84% 29.99 in 1.0 mi
#> # ... with 42 more rows, and 6 more variables: `Wind Dir` <chr>, `Wind
#> # Speed` <chr>, `Gust Speed` <chr>, Precip <chr>, Events <chr>,
#> # Conditions <chr>
或使用正则表达式,
# Drop every character that is not a digit, a decimal point or a minus sign
# before converting. The earlier class [\\W°FC] was unreliable: if \W is
# honoured inside the bracket it also strips "-" and ".", and if it is taken
# literally the whitespace before the unit is left in place.
obs %>%
  mutate(Temp. = as.numeric(gsub("[^0-9.-]", "", Temp.))) %>%
  as_tibble() # replaces the deprecated tbl_df()
返回同样的东西。