从Gmail下载子文件夹

时间:2012-04-22 09:24:18

标签: python r gmail web-scraping

我在评论中this blog找到的代码存在问题,作为该帖子中提供的代码扩展。

mailSoc <- function(login,
                    pass,
                    serv = "imap.gmail.com", #specify IMAP server
                    ntore = 50, #ignore if addressed to more than
                    todow = -1, #how many to download
                    begin = -1, #from which to start
                    folder = ''){ #folder to download (default:inbox)

  #load rJython and Python libraries
  require(rJython)
  rJython <- rJython(modules = "imaplib")
  rJython$exec("import imaplib")

  #connect to server
  rJython$exec(paste("mymail = imaplib.IMAP4_SSL('",
                     serv, "')", sep = ""))
  rJython$exec(paste("mymail.login(\'",
                     login, "\',\'",
                     pass, "\')", sep = ""))

  #get number of available messages
  rJython$exec(paste("sel = mymail.select(\"", folder,"\")", sep=""))
  rJython$exec("number = sel[1]")
  nofmsg <- .jstrVal(rJython$get("number"))
  nofmsg <- as.numeric(unlist(strsplit(nofmsg, "'"))[2])

  #if 'begin' not specified begin from the newest
  if(begin == -1)
  {
    begin <- nofmsg
  }

  #if 'todow' not specified download all
  if(todow == -1)
  {
    end <- 1
  }
  else
  {
    end <- begin - todow
  }

  #give a little bit of information
  todownload <- begin - end
  print(paste("Found", nofmsg, "emails"))
  print(paste("I will download", todownload, "messages."))
  print("It can take a while")

  data <- data.frame()

  #fetching emails
  for (i in begin:end) {
    nr <- as.character(i)

    #get sender
    rJython$exec(paste("typ, fro = mymail.fetch(\'", nr, "\', \'(BODY[HEADER.FIELDS (from)])\')", sep = ""))
    rJython$exec("fro = fro[0][1]")
    from <- .jstrVal(rJython$get("fro"))
    from <- unlist(strsplit(from, "[\r\n, \"]"))
  from <- sub("from: ", "", from, ignore.case = TRUE)
                            from <- grep("@", from, value = TRUE)
  #get addresees
  rJython$exec(paste("typ, to = mymail.fetch(\'", nr, "\', \'(BODY[HEADER.FIELDS (to)])\')", sep = ""))
  rJython$exec("to = to[0][1]")
  to <- .jstrVal(rJython$get("to"))
  to <- unlist(strsplit(to, "[\r\n, \"]"))
  to <- sub("to: ", "", to, ignore.case = TRUE)
  from <- sub("\"", "", from, ignore.case = TRUE)
  to <- grep("@", to, value = TRUE)

  #get dates:
  rJython$exec(paste("typ, date = mymail.fetch(\'", nr, "\', \'(BODY[HEADER.FIELDS (date)])\')", sep = ""))
  rJython$exec("date = date[0][1]")
  date <- .jstrVal(rJython$get("date"))

  #add to data frame
  #vec <- rep(from, length(to))
  if(length(to)==0)
  to <- 'NA'
  if(length(from)==0)
  to <- 'NA'
  data <- rbind(data, data.frame(from, to, date))

  #give some information about progress
  #print(i)
    if((i - begin) %% 100 == 0)
    {
      print(paste((i - begin)*(-1), "/", todownload,
                  " Downloading...", sep = ""))
    }
  }
  names(data) <- c("from", "to", "date")
  data$from <- tolower(data$from)
  data$to <- tolower(data$to)

  #close connection
  rJython$exec("mymail.shutdown()")
  return(data)
}

指定我想要下载电子邮件的文件夹

maild <- mailSoc("login", "passowrd", serv = "imap.gmail.com",
                 ntore = 20, todow = 200, folder='anywhere')

我收到错误消息:

[1] “Found NA emails”
[1] “I will download NA messages.”
[1] “It can take a while”
Error in begin:end : NA/NaN argument
In addition: Warning message:
In mailSoc(“xyz”, “xyz”, serv = “imap.gmail.com”, :
NAs introduced by coercion

你知道我该怎么办?我想在我的gmail中选择文件夹/子文件夹,接下来我要下载。

1 个答案:

答案 0 :(得分:0)

我找到了解决问题的方法。我需要做的是替换

rJython$exec(paste("sel = mymail.select(\"", folder,"\")", sep=""))

rJython$exec("sel = mymail.select('[Gmail]/All Mail')")

但是我还有另一个问题,我无法下载超过2500封电子邮件。也许你可以找到解决这个问题的方法......

[1] “Found 17976 emails”
[1] “I will download 2500 messages.”
[1] “It can take a while”
[1] “0/2500 Downloading…”
[1] “100/2500 Downloading…”
[1] “200/2500 Downloading…”
[1] “300/2500 Downloading…”
....MORE LINES....
[1] “2400/2500 Downloading…”
Error in data.frame(vec, to) :
arguments imply differing number of rows: 0, 1