使用read.csv.sql函数并加载数据的子集

时间:2019-01-17 19:46:05

标签: r

我正在尝试用 read.csv.sql 函数读取一个非常大的数据集,但似乎加载不到任何数据。我运行下面的命令,希望在读取时就对数据取子集,只读入 Market_Name 为芝加哥(CHICAGO)的所有行。不过,该列的值中带有一些空格。实际发生的情况是:R 读取了 csv 文件并按市场名称过滤,但最终加载了 0 个观测值。

我哪里做错了?——我已经在 read.csv.sql 的查询条件中加入了这些空格。另外,这里也可以使用 str_detect 吗?

# Attach sqldf, the package this snippet uses for read.csv.sql().
library(sqldf)
# Read the CSV through an SQL query so that only rows whose Market_Name equals
# the space-padded literal below are returned.
# NOTE(review): the str() output later in this post shows fields arriving with
# literal double quotes (e.g. "\"TOLEDO\""), so this exact, unquoted match
# probably never succeeds, which would explain the 0 rows -- confirm against
# the raw file. A pattern such as LIKE '%CHICAGO%' would tolerate both the
# quote characters and the padding.
data2 <- read.csv.sql("c:/Data/data/mydata.csv", 
                      sql = "select * from file where Market_Name = ' CHICAGO             ' ")

数据:

structure(list(MskdName = c(" Chain55 ", " Chain55 ", " Chain55 ", 
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", 
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", 
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", 
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", 
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", 
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", 
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", 
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", 
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", 
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", 
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", 
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", 
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", 
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", 
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", 
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", 
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", 
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", 
" Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", " Chain55 ", 
" Chain55 ", " Chain55 "), Market_Name = c(" CHICAGO             ", 
" CHICAGO             ", " CHICAGO             ", " CHICAGO             ", 
" CHICAGO             ", " CHICAGO             ", " CHICAGO             ", 
" CHICAGO             ", " CHICAGO             ", " CHICAGO             ", 
" CHICAGO             ", " CHICAGO             ", " CHICAGO             ", 
" CHICAGO             ", " CHICAGO             ", " CHICAGO             ", 
" CHICAGO             ", " CHICAGO             ", " CHICAGO             ", 
" CHICAGO             ", " CHICAGO             ", " CHICAGO             ", 
" CHICAGO             ", " CHICAGO             ", " CHICAGO             ", 
" CHICAGO             ", " CHICAGO             ", " CHICAGO             ", 
" CHICAGO             ", " CHICAGO             ", " CHICAGO             ", 
" CHICAGO             ", " CHICAGO             ", " CHICAGO             ", 
" CHICAGO             ", " CHICAGO             ", " CHICAGO             ", 
" CHICAGO             ", " CHICAGO             ", " CHICAGO             ", 
" CHICAGO             ", " CHICAGO             ", " CHICAGO             ", 
" CHICAGO             ", " CHICAGO             ", " CHICAGO             ", 
" CHICAGO             ", " CHICAGO             ", " CHICAGO             ", 
" CHICAGO             ", " CHICAGO             ", " CHICAGO             ", 
" CHICAGO             ", " CHICAGO             ", " CHICAGO             ", 
" CHICAGO             ", " CHICAGO             ", " CHICAGO             ", 
" CHICAGO             ", " CHICAGO             ", " CHICAGO             ", 
" CHICAGO             ", " CHICAGO             ", " CHICAGO             ", 
" CHICAGO             ", " CHICAGO             ", " CHICAGO             ", 
" CHICAGO             ", " CHICAGO             ", " CHICAGO             ", 
" CHICAGO             ", " CHICAGO             ", " CHICAGO             ", 
" CHICAGO             ", " CHICAGO             ", " CHICAGO             ", 
" CHICAGO             ", " CHICAGO             ", " CHICAGO             ", 
" CHICAGO             ", " CHICAGO             ", " CHICAGO             ", 
" CHICAGO             ", " CHICAGO             ", " CHICAGO             ", 
" CHICAGO             ", " CHICAGO             ", " CHICAGO             ", 
" CHICAGO             ", " CHICAGO             ", " CHICAGO             ", 
" CHICAGO             ", " CHICAGO             ", " CHICAGO             ", 
" CHICAGO             ", " CHICAGO             ", " CHICAGO             ", 
" CHICAGO             ", " CHICAGO             ", " CHICAGO             "
), WEEK = c(1114L, 1115L, 1116L, 1119L, 1121L, 1122L, 1123L, 
1124L, 1125L, 1128L, 1130L, 1132L, 1135L, 1141L, 1143L, 1159L, 
1160L, 1134L, 1135L, 1136L, 1115L, 1118L, 1120L, 1123L, 1125L, 
1127L, 1128L, 1132L, 1140L, 1141L, 1150L, 1155L, 1114L, 1114L, 
1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 
1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 
1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 
1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 
1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 
1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 
1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 1114L, 
1114L, 1114L, 1114L), SY = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 7L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 7L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 7L, 7L, 7L, 7L, 7L, 7L, 0L, 0L, 0L, 
6L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 7L, 0L), GE = c(1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 
1L, 1L), DOLLARS = c(5.38, 2.69, 10.76, 10.76, 5.38, 5.38, 2.69, 
2.69, 2.69, 2.69, 2.69, 3.39, 6.78, 3.39, 3.39, 3.39, 3.39, 3.39, 
3.39, 3.39, 2.69, 2.69, 2.69, 2.69, 2.69, 5.38, 5.38, 2.69, 3.39, 
3.39, 6.78, 6.78, 16.58, 22.26, 9.98, 33.96, 247.45, 203.31, 
44.91, 11.99, 19.96, 19.47, 23.97, 462.43, 157.92, 49.9, 6.58, 
21.82, 42.77, 181.82, 13.98, 13.16, 69.93, 12.49, 6.99, 22.9, 
35.97, 10.91, 7.99, 4.99, 3.99, 29.97, 5.49, 12.87, 15.98, 8.58, 
2.49, 36.36, 13.98, 11.48, 11.98, 21.8, 122.69, 120.01, 13.62, 
22.22, 20, 54.96, 2.69, 23.96, 32.97, 17.97, 32.97, 23.96, 11.98, 
15.98, 19.96, 20.12, 20.97, 15.57, 11.82, 16.76, 4.99, 28.28, 
98.2, 43.63, 14.97, 12.98, 19.98, 17.97)), class = c("data.table", 
"data.frame"), row.names = c(NA, -100L), .internal.selfref = <pointer: 0x0000000008ec1ef0>)

编辑:

我第一次运行代码时收到此警告(此后没有警告):

Warning message:
In .Internal(putconst(constBuf, constCount, x)) :
  closing unused connection 3 (C:\Users\user\Documents\sqltest.csv)

运行以下命令之后:

# Read sqltest.csv through an SQL query, keeping only rows whose Market_Name
# equals 'NEWYORK'; header and sep are passed explicitly.
# NOTE(review): if the file stores this field wrapped in double quotes (as the
# str() output later in this post suggests), the exact match on the bare value
# will likely return no rows -- confirm, or match with LIKE '%NEWYORK%'.
dataxyz <- read.csv.sql("C:\\Users\\user\\Documents\\sqltest.csv", 
                         "select * from file where Market_Name = 'NEWYORK'", header = TRUE, sep = ",")

会话信息:

> sessionInfo()
R version 3.5.2 (2018-12-20)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows >= 8 x64 (build 9200)

Matrix products: default

locale:
[1] LC_COLLATE=Spanish_Spain.1252  LC_CTYPE=Spanish_Spain.1252    LC_MONETARY=Spanish_Spain.1252
[4] LC_NUMERIC=C                   LC_TIME=Spanish_Spain.1252    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] sqldf_0.4-11  RSQLite_2.1.1 gsubfn_0.7    proto_1.0.0  

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.0                 MASS_7.3-51.1              quantstrat_0.14.6         
 [4] FinancialInstrument_1.3.1  bit_1.1-14                 lattice_0.20-38           
 [7] quadprog_1.5-5             foreach_1.4.4              TTR_0.23-4                
[10] blob_1.1.1                 tcltk_3.5.2                tools_3.5.2               
[13] xts_0.11-2                 PerformanceAnalytics_1.5.2 quantmod_0.4-13.1         
[16] grid_3.5.2                 DBI_1.0.0                  iterators_1.0.10          
[19] digest_0.6.18              yaml_2.2.0                 bit64_0.9-7               
[22] blotter_0.14.2             codetools_0.2-15           curl_3.3                  
[25] memoise_1.1.0              compiler_3.5.2             boot_1.3-20               
[28] chron_2.3-53               pkgconfig_2.0.2            zoo_1.8-4                 
Warning messages:
1: In if (!missing(a) && mayCallBrowser(a, cntxt)) return(TRUE) :
  closing unused connection 5 (C:\Users\Matt\Documents\sqltest.csv)
2: In if (!missing(a) && mayCallBrowser(a, cntxt)) return(TRUE) :
  closing unused connection 3 (C:\Users\Matt\Documents\sqltest.csv)

这是我要读取的文件:

https://ufile.io/e6f8z

编辑2: 这些建议效果很好,谢谢!只是一个小问题。数据读入如下所示:

         X IRI_KEY WEEK SY GE  VEND ITEM UNITS DOLLARS      F D PR                 UPC    OU     EST_ACV Market_Name
1 "105983"  264929 1114  0  2 18200  495     5    6.45 "NONE" 0  0 "00-02-18200-00495" " GR" " 26.67499"    "TOLEDO"
    Open    Clsd    MskdName                                  L1                                 L2
1 " 913" " 1419" " Chain57 " "CATEGORY - BEER/ALE/ALCOHOLIC CID" "DOMESTIC BEER/ALE (INC NON-ALCOH"
                    L3                   L4        L5                           L9 VOL_EQ
1 "ANHEUSER-BUSCH INC" "ANHEUSER-BUSCH INC" "BUD ICE" "+BUDIC LAGER BEER CAN 24OZ" 0.0833
                                                                  STUBSPEC
1 "+BUDIC LAGER BEER CAN       24OZ 0 2 18200   495 1  1 0.0833RP   00335"

只选取其中几列后如下:

                  UPC     EST_ACV Market_Name
1 "00-02-18200-00495" " 26.67499"    "TOLEDO"

具有如下结构:

> peak %>%
+   select(UPC, EST_ACV, Market_Name) %>%
+ head(1) %>%
+   str()
'data.frame':   1 obs. of  3 variables:
 $ UPC        : chr "\"00-02-18200-00495\""
 $ EST_ACV    : chr "\" 26.67499\""
 $ Market_Name: chr "\"TOLEDO\""

因此读入的数据中被多加了 " 字符。这不是大问题,因为我可以用 gsub 把它们去掉;我只是想知道,能否在导入数据时就不带上这些引号。

0 个答案:

没有答案