我尝试使用 read.csv.sql 函数读取一个非常大的数据集,但似乎无法加载任何数据。我运行下面的命令,希望在读取时就对数据取子集,只读入 Market_Name 为芝加哥(CHICAGO)的所有行。但是,该列的值两侧带有一些空格。实际发生的情况是:R 读取 csv 文件并按市场名称过滤,结果却加载了 0 个观察值。
我哪里做错了?——我已经在 read.csv.sql 函数的查询条件中加上了这些空格。另外,我可以在这里使用 str_detect 吗?
library(sqldf)
data2 <- read.csv.sql("c:/Data/data/mydata.csv",
sql = "select * from file where Market_Name = ' CHICAGO ' ")
数据:
structure(list(MskdName = c(" Chain55 ", " Chain55 ", " Chain55 ",
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ",
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ",
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ",
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ",
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ",
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ",
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ",
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ",
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ",
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ",
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ",
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ",
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ",
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ",
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ",
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ",
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ",
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ",
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ",
" Chain55 ", " Chain55 "), Market_Name = c(" CHICAGO ",
" CHICAGO ", " CHICAGO ", " CHICAGO ",
" CHICAGO ", " CHICAGO ", " CHICAGO ",
" CHICAGO ", " CHICAGO ", " CHICAGO ",
" CHICAGO ", " CHICAGO ", " CHICAGO ",
" CHICAGO ", " CHICAGO ", " CHICAGO ",
" CHICAGO ", " CHICAGO ", " CHICAGO ",
" CHICAGO ", " CHICAGO ", " CHICAGO ",
" CHICAGO ", " CHICAGO ", " CHICAGO ",
" CHICAGO ", " CHICAGO ", " CHICAGO ",
" CHICAGO ", " CHICAGO ", " CHICAGO ",
" CHICAGO ", " CHICAGO ", " CHICAGO ",
" CHICAGO ", " CHICAGO ", " CHICAGO ",
" CHICAGO ", " CHICAGO ", " CHICAGO ",
" CHICAGO ", " CHICAGO ", " CHICAGO ",
" CHICAGO ", " CHICAGO ", " CHICAGO ",
" CHICAGO ", " CHICAGO ", " CHICAGO ",
" CHICAGO ", " CHICAGO ", " CHICAGO ",
" CHICAGO ", " CHICAGO ", " CHICAGO ",
" CHICAGO ", " CHICAGO ", " CHICAGO ",
" CHICAGO ", " CHICAGO ", " CHICAGO ",
" CHICAGO ", " CHICAGO ", " CHICAGO ",
" CHICAGO ", " CHICAGO ", " CHICAGO ",
" CHICAGO ", " CHICAGO ", " CHICAGO ",
" CHICAGO ", " CHICAGO ", " CHICAGO ",
" CHICAGO ", " CHICAGO ", " CHICAGO ",
" CHICAGO ", " CHICAGO ", " CHICAGO ",
" CHICAGO ", " CHICAGO ", " CHICAGO ",
" CHICAGO ", " CHICAGO ", " CHICAGO ",
" CHICAGO ", " CHICAGO ", " CHICAGO ",
" CHICAGO ", " CHICAGO ", " CHICAGO ",
" CHICAGO ", " CHICAGO ", " CHICAGO ",
" CHICAGO ", " CHICAGO ", " CHICAGO ",
" CHICAGO ", " CHICAGO ", " CHICAGO "
), WEEK = c(1114L, 1115L, 1116L, 1119L, 1121L, 1122L, 1123L,
1124L, 1125L, 1128L, 1130L, 1132L, 1135L, 1141L, 1143L, 1159L,
1160L, 1134L, 1135L, 1136L, 1115L, 1118L, 1120L, 1123L, 1125L,
1127L, 1128L, 1132L, 1140L, 1141L, 1150L, 1155L, 1114L, 1114L,
1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L,
1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L,
1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L,
1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L,
1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L,
1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L,
1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L,
1114L, 1114L, 1114L), SY = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 7L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 7L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 7L, 7L, 7L, 7L, 7L, 7L, 0L, 0L, 0L,
6L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 7L, 0L), GE = c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L,
1L, 1L), DOLLARS = c(5.38, 2.69, 10.76, 10.76, 5.38, 5.38, 2.69,
2.69, 2.69, 2.69, 2.69, 3.39, 6.78, 3.39, 3.39, 3.39, 3.39, 3.39,
3.39, 3.39, 2.69, 2.69, 2.69, 2.69, 2.69, 5.38, 5.38, 2.69, 3.39,
3.39, 6.78, 6.78, 16.58, 22.26, 9.98, 33.96, 247.45, 203.31,
44.91, 11.99, 19.96, 19.47, 23.97, 462.43, 157.92, 49.9, 6.58,
21.82, 42.77, 181.82, 13.98, 13.16, 69.93, 12.49, 6.99, 22.9,
35.97, 10.91, 7.99, 4.99, 3.99, 29.97, 5.49, 12.87, 15.98, 8.58,
2.49, 36.36, 13.98, 11.48, 11.98, 21.8, 122.69, 120.01, 13.62,
22.22, 20, 54.96, 2.69, 23.96, 32.97, 17.97, 32.97, 23.96, 11.98,
15.98, 19.96, 20.12, 20.97, 15.57, 11.82, 16.76, 4.99, 28.28,
98.2, 43.63, 14.97, 12.98, 19.98, 17.97)), class = c("data.table",
"data.frame"), row.names = c(NA, -100L), .internal.selfref = <pointer: 0x0000000008ec1ef0>)
编辑:
我第一次运行代码时收到此警告(此后没有警告):
Warning message:
In .Internal(putconst(constBuf, constCount, x)) :
closing unused connection 3 (C:\Users\user\Documents\sqltest.csv)
运行以下命令之后:
dataxyz <- read.csv.sql("C:\\Users\\user\\Documents\\sqltest.csv",
"select * from file where Market_Name = 'NEWYORK'", header = TRUE, sep = ",")
会话信息:
> sessionInfo()
R version 3.5.2 (2018-12-20)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows >= 8 x64 (build 9200)
Matrix products: default
locale:
[1] LC_COLLATE=Spanish_Spain.1252 LC_CTYPE=Spanish_Spain.1252 LC_MONETARY=Spanish_Spain.1252
[4] LC_NUMERIC=C LC_TIME=Spanish_Spain.1252
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] sqldf_0.4-11 RSQLite_2.1.1 gsubfn_0.7 proto_1.0.0
loaded via a namespace (and not attached):
[1] Rcpp_1.0.0 MASS_7.3-51.1 quantstrat_0.14.6
[4] FinancialInstrument_1.3.1 bit_1.1-14 lattice_0.20-38
[7] quadprog_1.5-5 foreach_1.4.4 TTR_0.23-4
[10] blob_1.1.1 tcltk_3.5.2 tools_3.5.2
[13] xts_0.11-2 PerformanceAnalytics_1.5.2 quantmod_0.4-13.1
[16] grid_3.5.2 DBI_1.0.0 iterators_1.0.10
[19] digest_0.6.18 yaml_2.2.0 bit64_0.9-7
[22] blotter_0.14.2 codetools_0.2-15 curl_3.3
[25] memoise_1.1.0 compiler_3.5.2 boot_1.3-20
[28] chron_2.3-53 pkgconfig_2.0.2 zoo_1.8-4
Warning messages:
1: In if (!missing(a) && mayCallBrowser(a, cntxt)) return(TRUE) :
closing unused connection 5 (C:\Users\Matt\Documents\sqltest.csv)
2: In if (!missing(a) && mayCallBrowser(a, cntxt)) return(TRUE) :
closing unused connection 3 (C:\Users\Matt\Documents\sqltest.csv)
这是我要读取的文件:
编辑2:这些建议效果很好,谢谢!只是还有一个小问题。读入后的数据如下所示:
X IRI_KEY WEEK SY GE VEND ITEM UNITS DOLLARS F D PR UPC OU EST_ACV Market_Name
1 "105983" 264929 1114 0 2 18200 495 5 6.45 "NONE" 0 0 "00-02-18200-00495" " GR" " 26.67499" "TOLEDO"
Open Clsd MskdName L1 L2
1 " 913" " 1419" " Chain57 " "CATEGORY - BEER/ALE/ALCOHOLIC CID" "DOMESTIC BEER/ALE (INC NON-ALCOH"
L3 L4 L5 L9 VOL_EQ
1 "ANHEUSER-BUSCH INC" "ANHEUSER-BUSCH INC" "BUD ICE" "+BUDIC LAGER BEER CAN 24OZ" 0.0833
STUBSPEC
1 "+BUDIC LAGER BEER CAN 24OZ 0 2 18200 495 1 1 0.0833RP 00335"
只选取其中几列后如下:
UPC EST_ACV Market_Name
1 "00-02-18200-00495" " 26.67499" "TOLEDO"
具有如下结构:
> peak %>%
+ select(UPC, EST_ACV, Market_Name) %>%
+ head(1) %>%
+ str()
'data.frame': 1 obs. of 3 variables:
$ UPC : chr "\"00-02-18200-00495\""
$ EST_ACV : chr "\" 26.67499\""
$ Market_Name: chr "\"TOLEDO\""
因此,读入的数据中被加上了多余的双引号(")。这不是主要问题,因为我可以使用 gsub 消除它们,我只是想知道是否可以在导入数据时就不带这些引号。