我已阅读大部分有关此类错误的帖子,但均不适用于我的情况。我是R的新人,根据Nolan和Lang的书中的数据科学案例研究在R学校完成作业。我正在使用统计数据识别垃圾邮件,可以找到代码的网址{{3} },需要处理来自here的文件(非常大)
现在我遇到的问题如下(只是发布了我遇到问题的代码块):
sampleSplit = lapply(sampleEmail, splitMessage)
processHeader = function(header)
{
# modify the first line to create a key:value pair
header[1] = sub("^From", "Top-From:", header[1])
headerMat = read.dcf(textConnection(header), all = TRUE)
headerVec = unlist(headerMat)
dupKeys = sapply(headerMat, function(x) length(unlist(x)))
names(headerVec) = rep(colnames(headerMat), dupKeys)
return(headerVec)
}
headerList = lapply(sampleSplit,
function(msg) {
processHeader(msg$header)} )
contentTypes = sapply(headerList, function(header)
header["Content-Type"])
names(contentTypes) = NULL
contentTypes
hasAttach = grep("^ *multi", tolower(contentTypes))
hasAttach
boundaries = getBoundary(contentTypes[ hasAttach ])
boundaries
boundary = boundaries[9]
body = sampleSplit[[15]]$body
bString = paste("--", boundary, sep = "")
bStringLocs = which(bString == body)
bStringLocs
eString = paste("--", boundary, "--", sep = "")
eStringLoc = which(eString == body)
eStringLoc
diff(c(bStringLocs[-1], eStringLoc))
### This code has mistakes in it - and we fix them later!
processAttach = function(body, contentType){
boundary = getBoundary(contentType)
bString = paste("--", boundary, "$", sep = "")
bStringLocs = grep(bString, body)
eString = paste("--", boundary, "--$", sep = "")
eStringLoc = grep(eString, body)
n = length(body)
if (length(eStringLoc) == 0) eStringLoc = n + 1
if (length(bStringLocs) == 1) attachLocs = NULL
else attachLocs = c(bStringLocs[-1], eStringLoc)
msg = body[ (bStringLocs[1] + 1) : min(n, (bStringLocs[2] - 1),
na.rm = TRUE)]
if ( eStringLoc < n )
msg = c(msg, body[ (eStringLoc + 1) : n ])
if ( !is.null(attachLocs) ) {
attachLens = diff(attachLocs, lag = 1)
attachTypes = mapply(function(begL, endL) {
contentTypeLoc = grep("[Cc]ontent-[Tt]ype", body[ (begL + 1) : (endL - 1)])
contentType = body[ begL + contentTypeLoc]
contentType = gsub('"', "", contentType )
MIMEType = sub(" *Content-Type: *([^;]*);?.*", "\\1", contentType)
return(MIMEType)
}, attachLocs[-length(attachLocs)], attachLocs[-1])
}
if (is.null(attachLocs)) return(list(body = msg, attachInfo = NULL) )
else return(list(body = msg,
attachDF = data.frame(aLen = attachLens,
aType = attachTypes,
stringsAsFactors = FALSE)))
}
bodyList = lapply(sampleSplit, function(msg) msg$body)
attList = mapply(processAttach, bodyList[hasAttach],
contentTypes[hasAttach],
SIMPLIFY = FALSE)
lens = sapply(attList, function(processedA)
processedA$attachDF$aLen)
head(lens)
attList[[2]]$attachDF
body = bodyList[hasAttach][[2]]
length(body)
body[35:45]
processAttach = function(body, contentType){
n = length(body)
boundary = getBoundary(contentType)
bString = paste("--", boundary, sep = "")
bStringLocs = which(bString == body)
eString = paste("--", boundary, "--", sep = "")
eStringLoc = which(eString == body)
if (length(eStringLoc) == 0) eStringLoc = n
if (length(bStringLocs) <= 1) {
attachLocs = NULL
msgLastLine = n
if (length(bStringLocs) == 0) bStringLocs = 0
} else {
attachLocs = c(bStringLocs[ -1 ], eStringLoc)
msgLastLine = bStringLocs[2] - 1
}
msg = body[ (bStringLocs[1] + 1) : msgLastLine]
if ( eStringLoc < n )
msg = c(msg, body[ (eStringLoc + 1) : n ])
if ( !is.null(attachLocs) ) {
attachLens = diff(attachLocs, lag = 1)
attachTypes = mapply(function(begL, endL) {
CTloc = grep("^[Cc]ontent-[Tt]ype", body[ (begL + 1) : (endL - 1)])
if ( length(CTloc) == 0 ) {
MIMEType = NA
} else {
CTval = body[ begL + CTloc[1] ]
CTval = gsub('"', "", CTval )
MIMEType = sub(" *[Cc]ontent-[Tt]ype: *([^;]*);?.*", "\\1", CTval)
}
return(MIMEType)
}, attachLocs[-length(attachLocs)], attachLocs[-1])
}
if (is.null(attachLocs)) return(list(body = msg, attachDF = NULL) )
return(list(body = msg,
attachDF = data.frame(aLen = attachLens,
aType = unlist(attachTypes),
stringsAsFactors = FALSE)))
}
readEmail = function(dirName) {
# retrieve the names of files in directory
fileNames = list.files(dirName, full.names = TRUE)
# drop files that are not email
notEmail = grep("cmds$", fileNames)
if ( length(notEmail) > 0) fileNames = fileNames[ - notEmail ]
# read all files in the directory
lapply(fileNames, readLines, encoding = "latin1")
}
processAllEmail = function(dirName, isSpam = FALSE)
{
# read all files in the directory
messages = readEmail(dirName)
fileNames = names(messages)
n = length(messages)
# split header from body
eSplit = lapply(messages, splitMessage)
rm(messages)
# process header as named character vector
headerList = lapply(eSplit, function(msg)
processHeader(msg$header))
# extract content-type key
contentTypes = sapply(headerList, function(header)
header["Content-Type"])
# extract the body
bodyList = lapply(eSplit, function(msg) msg$body)
rm(eSplit)
# which email have attachments
hasAttach = grep("^ *multi", tolower(contentTypes))
# get summary stats for attachments and the shorter body
attList = mapply(processAttach, bodyList[hasAttach],
contentTypes[hasAttach], SIMPLIFY = FALSE)
bodyList[hasAttach] = lapply(attList, function(attEl)
attEl$body)
attachInfo = vector("list", length = n )
attachInfo[ hasAttach ] = lapply(attList,
function(attEl) attEl$attachDF)
# prepare return structure
emailList = mapply(function(header, body, attach, isSpam) {
list(isSpam = isSpam, header = header,
body = body, attach = attach)
},
headerList, bodyList, attachInfo,
rep(isSpam, n), SIMPLIFY = FALSE )
names(emailList) = fileNames
invisible(emailList)
}
一切正常,直到:
emailStruct = mapply(processAllEmail, fullDirNames,
isSpam = rep( c(FALSE, TRUE), 3:2))
emailStruct = unlist(emailStruct, recursive = FALSE)
sampleStruct = emailStruct[ indx ]
save(emailStruct, file="emailXX.rda")
我在textConnection(标题)中收到错误错误:所有连接都在使用中,因此它不会识别&#34; emailStruct&#34;,我稍后需要它。我真的不知道如何克服它,以便我可以继续使用其余的代码,这需要其中一些变量才能工作。
答案 0 :(得分:0)
当您运行textConnection()
时,您正在打开文本连接,但您永远不会关闭它。在阅读之后尝试明确地关闭它
read.dcf(tc<-textConnection(header), all = TRUE)
close(tc)