从.csv中提取列数据,并将每10个连续行转换为相应的列

时间:2015-04-11 21:45:48

标签: r csv data-cleaning

以下是我尝试实施的代码。我想提取这10个连续的行值并将它们转换为相应的列。

这就是数据的样子:https://drive.google.com/file/d/0B7huoyuu0wrfeUs4d2p0eGpZSFU/view?usp=sharing

我一直在尝试,但 temp1 和 temp2 得到的结果都是空的。请帮忙。

library(Hmisc)     #for increment function

# Asker's attempt: walk the V2 column and collect every run of 10 values into
# one row of data.sorted. The code is kept as posted; NOTE(review) comments
# mark the problems visible in it.

# Read the CSV without a header row; fill=TRUE pads rows that have fewer fields.
myData <- read.csv("Clothing_&_Accessories.csv",header=FALSE,sep=",",fill=TRUE) # reading the csv file

# NOTE(review): `extract` is assigned here but never read in the code below.
extract<-myData$V2 # extracting the desired column

x<-1    
y<-1

temp1 <- NULL       #initialisation    
temp2 <- NULL       #initialisation    
data.sorted <- NULL #initialisation

limit<-nrow(myData)  # Calculating no of rows

# NOTE(review): `x! = limit` does not parse in R; the inequality operator is
# written `!=` with no space between the two characters.
while (x! = limit) {    
  count <- 1    
    # NOTE(review): `for (count in 11)` iterates over the single value 11 (it is
    # not 1:11), so `count > 10` is always TRUE and the else branch never runs.
    # temp1 is therefore never assigned and stays NULL.
    for (count in 11) {    
      if (count > 10) {    
         inc(x) <- 1    
         break    # gets out of for loop
      }    
      else {    
         # NOTE(review): `data_mat` is not defined anywhere in this script;
         # presumably `extract` was intended -- confirm with the author.
         temp1[y]<-data_mat[x]  # extracting by every row element    
      }
      inc(x) <- 1  # increment x
     inc(y) <- 1  # increment y                    
   }
   temp2<-temp1
   # rbind() appends temp2 as a new row of data.sorted on every pass.
   data.sorted<-rbind(data.sorted,temp2)  # turn rows into columns 
}

2 个答案:

答案 0 :(得分:2)

您的代码太复杂了。你可以只使用一个for循环,没有外部软件包,就像这样:

# Toy input: a single column (V1) holding two 10-value records separated by
# one blank row.
myData <- as.data.frame(matrix(c(rep("a", 10), "", rep("b", 10)), ncol=1), stringsAsFactors = FALSE)

record_len <- 10L           # values per record
stride <- record_len + 1L   # each record is followed by one separator row

# The last record has no trailing separator, hence the "+ 1" before dividing.
# Integer division with seq_len() gives zero iterations when there is no
# complete record, whereas 1:n would still run once for a fractional n < 1.
n_records <- (nrow(myData) + 1L) %/% stride

# Start from a 10-row, zero-column frame and add one column per record.
newData <- data.frame(row.names = seq_len(record_len))
for (i in seq_len(n_records)) {
  start <- stride * (i - 1L) + 1L
  newData[[paste0("col", i)]] <- myData$V1[start:(start + record_len - 1L)]
}

你实际上并不需要这一切。您可以简单地删除空行,将向量拆分为大小为10的块(如此处所述),然后将列表转换为数据框。

# Keep only the non-empty values, i.e. drop the blank separator rows.
has_text <- nchar(myData$V1) > 0
vec <- myData$V1[has_text]

# Label consecutive runs of 10 values (1, 1, ..., 2, 2, ...); split() turns
# each run into a list element, and each element becomes one column.
chunk_id <- ceiling(seq_along(vec) / 10)
as.data.frame(split(vec, chunk_id))

#    X1 X2
# 1   a  b
# 2   a  b
# 3   a  b
# 4   a  b
# 5   a  b
# 6   a  b
# 7   a  b
# 8   a  b
# 9   a  b
# 10  a  b

答案 1 :(得分:2)

我们可以根据“V2”列中的''值创建数字索引,split数据集,使用Reduce/merge获取宽格式的列。

# Rows whose V2 is empty separate the records; the running count of those
# rows (plus one) labels every row with a record number.
indx <- cumsum(myData$V2 == '') + 1
# Split into one piece per record and join the pieces on the field name in V1.
# merge() defaults to all = FALSE, so only V1 keys present in every piece
# are kept.
res <- Reduce(function(...) merge(..., by = 'V1'), split(myData, indx))
# merge() sorts by key; restore the field order of the first ten input rows.
res1 <- res[order(factor(res$V1, levels = myData[1:10, 1])), ]
# Name the value columns from the actual column count instead of assuming
# exactly three records.
colnames(res1)[-1] <- paste0('Col', seq_len(ncol(res1) - 1L))
head(res1,3)
#            V1       Col1       Col2       Col3
#2     ProductId B000179R3I B0000C3XXN B0000C3XX9
#4 product_title Amazon.com Amazon.com Amazon.com
#3 product_price    unknown    unknown    unknown

从 p1.png 来看,'V1' 列也可以作为 'V2' 中各值的列名。如果是这种情况,我们可以“转置” 'res1' 中除第一列之外的部分,并使用 'res1' 的第一列来更改输出的列名称(setNames(...))。
# Transpose the value columns of res1 so that each record becomes a row, and
# use the field names in the first column of res1 as the column names.
res2 <- setNames(as.data.frame(t(res1[-1]), stringsAsFactors=FALSE), 
                       res1[,1]) 
row.names(res2) <- NULL
# Re-infer each column's type after the transpose. as.is = TRUE keeps text
# columns as character; leaving as.is unspecified raises a warning on
# R >= 4.0.
res2[] <- lapply(res2, type.convert, as.is = TRUE)
head(res2)
#   ProductId product_title product_price         userid
#1 B000179R3I    Amazon.com       unknown A3Q0VJTU04EZ56
#2 B0000C3XXN    Amazon.com       unknown A34JM8F992M9N1
#3 B0000C3XX9    Amazon.com       unknown A34JM8F993MN91
#                  profileName helpfulness reviewscore review_time
#1 Jeanmarie Kabala "JP Kabala"         7/7           4  1182816000
#2                   M. Shapiro         6/6           5  1205107200
#3                     J. Cruze         8/8           5   120571929
#              review_summary
#1 Periwinkle Dartmouth Blazer
#2        great classic jacket
#3                 Good jacket
#                                            review_text
#1 I own the Austin Reed dartmouth blazer in every color
#2          This is the second time I bought this jacket
#3           This is the third time I bought this jacket

我想这只是一个 reshaping 问题。在这种情况下,我们可以使用 data.table 中的 dcast 将 long 格式转换为 wide 格式

library(data.table)
# Convert myData to a data.table by reference and drop the separator rows
# (those with an empty V1).
long <- setDT(myData)[V1 != ""]
# Within each field name, number the occurrences Col1, Col2, ...; this label
# becomes the column name in the wide result.
long[, N := paste0("Col", 1:.N), by = V1]
# Reshape long -> wide: one row per field name, one column per occurrence.
DT <- dcast(long, V1 ~ N, value.var = "V2")

数据

 # Reproducible example data: a two-column data.frame with 32 rows, where V1
 # holds a field name and V2 the matching value. It contains three records of
 # ten fields each, separated by rows in which both columns are empty strings.
 myData <- structure(list(V1 = c("ProductId", "product_title",
 "product_price", 
 "userid", "profileName", "helpfulness", "reviewscore", "review_time", 
 "review_summary", "review_text", "", "ProductId", "product_title", 
 "product_price", "userid", "profileName", "helpfulness", 
 "reviewscore", 
 "review_time", "review_summary", "review_text", "", "ProductId", 
 "product_title", "product_price", "userid", "profileName",
 "helpfulness",  
 "reviewscore", "review_time", "review_summary", "review_text"
 ), V2 = c("B000179R3I", "Amazon.com", "unknown", "A3Q0VJTU04EZ56", 
 "Jeanmarie Kabala \"JP Kabala\"", "7/7", "4", "1182816000", 
 "Periwinkle Dartmouth Blazer", 
 "I own the Austin Reed dartmouth blazer in every color", "", 
 "B0000C3XXN", "Amazon.com", "unknown", "A34JM8F992M9N1",
 "M. Shapiro", 
 "6/6", "5", "1205107200", "great classic jacket",
 "This is the second time I bought this jacket", 
 "", "B0000C3XX9", "Amazon.com", "unknown", "A34JM8F993MN91", 
 "J. Cruze", "8/8", "5", "120571929", "Good jacket",
 "This is the third time I bought this jacket"
 )), .Names = c("V1", "V2"), row.names = c(NA, 32L),
 class = "data.frame")