更确切地说,我正在构建一个箱形图来比较两个变量,即电影的年份和类型。但在我的 .csv 文件中,有些行的电影标题里包含 ";",而我的分隔符也是 ";"。我怎样才能删除这些行(58000 行中有 5 行)?
'data.frame': 58792 obs. of 31 variables:
$ title : Factor w/ 55999 levels "-30-","...4 ...3 ...2 ...1 ...morte",..: 88 89 90 91 92 93 94 33 34 35 ...
$ year : num 85 53 55 110 89 114 116 116 101 31 ...
$ length : int 121 71 7 70 71 91 93 25 97 61 ...
$ budget : int NA NA NA NA NA NA NA NA NA NA ...
$ rating : num 6.4 6 8.2 8.2 3.4 4.3 5.3 6.7 6.6 6 ...
$ votes : num 348 20 5 6 17 45 200 24 18 51 ...
$ r1 : num 4.5 0 0 14.5 24.5 4.5 4.5 4.5 4.5 4.5 ...
$ r2 : num 4.5 14.5 0 0 4.5 4.5 0 4.5 4.5 0 ...
$ r3 : num 4.5 4.5 0 0 0 4.5 4.5 4.5 4.5 4.5 ...
$ r4 : num 4.5 24.5 0 0 14.5 14.5 4.5 4.5 0 4.5 ...
$ r5 : num 14.5 14.5 0 0 14.5 14.5 24.5 4.5 0 4.5 ...
$ r6 : num 24.5 14.5 24.5 0 4.5 14.5 24.5 14.5 0 44.5 ...
$ r7 : num 24.5 14.5 0 0 0 4.5 14.5 14.5 34.5 14.5 ...
$ r8 : num 14.5 4.5 44.5 0 0 4.5 4.5 14.5 14.5 4.5 ...
$ r9 : num 4.5 4.5 24.5 34.5 0 14.5 4.5 4.5 4.5 4.5 ...
$ r10 : num 4.5 14.5 24.5 45.5 24.5 14.5 14.5 14.5 24.5 4.5 ...
$ mpaa : Factor w/ 8 levels "","0","14,5",..: 1 1 1 1 1 1 8 1 1 1 ...
$ Action : int 0 0 0 0 0 0 1 0 0 0 ...
$ Animation : int 0 0 1 0 0 0 0 0 0 0 ...
$ Comedy : int 1 1 0 1 0 0 0 0 0 0 ...
$ Drama : int 1 0 0 0 0 1 1 0 1 0 ...
$ Documentary : int 0 0 0 0 0 0 0 1 0 0 ...
$ Romance : int 0 0 0 0 0 0 0 0 0 0 ...
$ Short : int 0 0 1 0 0 0 0 1 0 0 ...
$ Action_C : chr "No" "No" "No" "No" ...
$ Animation_C : chr "No" "No" "Yes" "No" ...
$ Comedy_C : chr "Yes" "Yes" "No" "Yes" ...
$ Drama_C : chr "Yes" "No" "No" "No" ...
$ Documentary_C: chr "No" "No" "No" "No" ...
$ Romance_C : chr "No" "No" "No" "No" ...
$ Short_C : chr "No" "No" "Yes" "No" ...
答案 0 :(得分:1)
# Toy data: the third title carries a stray semicolon
dat <- data.frame(
  title = c("Babe", "Toy Story", "Mulan;"),
  rating = c(3, 4, 5)
)
> dat
title rating
1 Babe 3
2 Toy Story 4
3 Mulan; 5
# Keep only the rows whose title has no semicolon in it
keep <- !grepl(";", dat[["title"]], fixed = TRUE)
dat <- dat[keep, ]
> dat
title rating
1 Babe 3
2 Toy Story 4
答案 1 :(得分:0)
## First read the file in as raw text, one element per line.
x <- readLines("file.csv")
## The data has 31 columns, so a well-formed line (header included) holds
## exactly 30 semicolons; any line with a different count is dropped.
n_cols <- 31L
## regmatches() returns an empty match set for a line without any ';', so
## lengths() is a true separator count (gregexpr() alone reports -1 there) and
## the result stays an integer vector even when `x` is empty.
n_sep <- lengths(regmatches(x, gregexpr(";", x, fixed = TRUE)))
xClean <- x[n_sep == n_cols - 1L]
## Now parse the cleaned text into a data.frame.
## NOTE(review): read.table() defaults to dec = "." and treats both ' and " as
## quote characters; if the file uses comma decimals or titles contain
## apostrophes, read.csv2()-style settings may be needed -- confirm on the file.
dat <- read.table(text = xClean, sep = ";", header = TRUE)
## Alternative that keeps the affected rows: fold the surplus leading fields
## back into the title instead of dropping the line.
x <- readLines("file.csv")
## 31 columns are expected, i.e. the last 30 fields of every line are data and
## whatever precedes them belongs to the title.
n_cols <- 31L
xSplit <- strsplit(x, ";", fixed = TRUE)
xClean <- vapply(seq_along(xSplit), function(i) {
  s <- xSplit[[i]]
  n_title <- length(s) - (n_cols - 1L)
  if (n_title < 1L) {
    ## Too few fields to realign (strsplit() also drops a trailing empty
    ## field); hand the raw line through untouched.
    return(x[[i]])
  }
  ## Title pieces are joined with a space, the data fields stay ';'-separated.
  paste(c(paste(s[seq_len(n_title)], collapse = " "), s[-seq_len(n_title)]),
        collapse = ";")
}, character(1))
## Parse the repaired text into a data.frame.
dat <- read.table(text = xClean, sep = ";", header = TRUE)