在Python中从网站下载csv文件的问题

时间:2015-02-26 22:44:39

标签: python csv urllib

我正在尝试从一个需要用电子邮件地址和密码进行身份验证的网站下载csv文件,并将其保存到我的计算机上。我有以下代码:

import cStringIO
import pycurl
import urllib

# Report to download; the percent-encoded tail decodes to "rig/rig-150226.csv".
url = 'http://www.riglocator.ca/report=rig%2Frig%2D150226%2Ecsv'



def GetPage(url, proxy=None):
    """Fetch ``url`` with pycurl and return the response body.

    Args:
        url: Address to request. Redirects are followed.
        proxy: Optional SOCKS5 proxy given as ``host``, ``host:port`` or
            ``socks://host:port``. The port defaults to 8888 when omitted.

    Returns:
        The response body as a string, or ``""`` when the transfer fails
        with ``pycurl.error`` (for example on the 5 second connect timeout
        or the 8 second overall timeout).
    """
    port = 8888
    if proxy:
        proxy = proxy.replace("socks://", "")
        if ":" in proxy:
            proxy, port_text = proxy.rsplit(":", 1)
            port = int(port_text)

    buf = cStringIO.StringIO()
    c = pycurl.Curl()
    try:
        c.setopt(c.URL, url)
        c.setopt(c.WRITEFUNCTION, buf.write)
        c.setopt(c.CONNECTTIMEOUT, 5)
        c.setopt(c.TIMEOUT, 8)
        if proxy:
            c.setopt(pycurl.PROXY, proxy)
            c.setopt(pycurl.PROXYPORT, port)
            c.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5)
            # NOTE(review): the credentials are hard-coded and only sent when
            # a proxy is configured -- confirm that both are intended.
            c.setopt(pycurl.USERPWD, 'john@mail.com:password123')
        c.setopt(c.FOLLOWLOCATION, True)
        c.perform()
        results = buf.getvalue()
    except pycurl.error:
        # Best effort: a failed transfer yields an empty body for the caller
        # to check, while programming errors and Ctrl-C still propagate.
        results = ""
    finally:
        # Release the curl handle and the buffer even when perform() fails.
        c.close()
        buf.close()
    return results

# Fetches the page once when the script is loaded; the returned body is discarded here.
GetPage(url,"socks://127.0.0.1:8888")

def loader():
    """Download the report through the local SOCKS proxy into ``mapfile.csv``.

    Raises:
        IOError: If ``GetPage`` returned no data.
    """
    # GetPage returns the response body itself, not a URL, so the text is
    # written out directly instead of being handed to urllib.urlopen.
    csv_text = GetPage(url, "socks://127.0.0.1:8888")
    if not csv_text:
        raise IOError("no data received from %s" % url)
    # Binary mode keeps the downloaded line endings exactly as received.
    with open('mapfile.csv', 'wb') as fx:
        fx.write(csv_text)

# Script entry point: run the download when the file is executed.
loader()

但是这仍然只返回登录页面的HTML代码,有什么建议吗?

我收到此错误:

     File "C:/Users/cevans/PycharmProjects/RigLocatorMapPull/rigmapscrape.py", line 55, in <module>
loader()
  File "C:/Users/cevans/PycharmProjects/RigLocatorMapPull/rigmapscrape.py", line 44, in loader
    r = urllib.urlopen(csv_url)
  File "C:\Python27\lib\urllib.py", line 87, in urlopen
    return opener.open(url)
  File "C:\Python27\lib\urllib.py", line 208, in open
    return getattr(self, name)(url)
  File "C:\Python27\lib\urllib.py", line 463, in open_file
    return self.open_local_file(url)
  File "C:\Python27\lib\urllib.py", line 477, in open_local_file
    raise IOError(e.errno, e.strerror, e.filename)
IOError: [Errno 2] The system cannot find the path specified: ''

Process finished with exit code 1

2 个答案:

答案 0 :(得分:0)

这是我用pycurl抓取文件的一些代码的链接,它基本上可以满足你的需求。你只需在我的代码中添加选项 c.setopt(pycurl.USERPWD, 'username:userpass') 来设置你的用户名和密码即可。

http://prestongarrison.com/proper-python-pycurl-example/

答案 1 :(得分:0)

# Solution using the mechanize browser library: it rewrites the report URL
# for the current date, submits the username/password through a form,
# downloads the CSV and writes it to a folder location.

__author__ = 'cevans'

import mechanize
import os
import cookielib
import datetime, string

# Login credentials submitted through the site's form (placeholders).
USERNAME = 'xxxx'
PASSWORD = 'xxxxx'
# Template report URL. It must contain the date stamp '150301', which
# loader() swaps for the current date.
OLDURL = 'http://www.oldurl.com/report150301'
# Destination folder for the downloaded CSV.
folder = r'\\Driver'

def loader():
    """Log in to the site and save today's report as ``map-<yymmdd>.csv``.

    Builds the report URL by substituting today's date (``%y%m%d``) for the
    ``150301`` stamp in ``OLDURL``, opens it with a mechanize browser, fills
    the first form on the returned page with ``USERNAME`` and ``PASSWORD``,
    submits it, requests the report URL again and writes the response body
    into ``folder``.
    """
    # The schedule only runs on the day of the report, so today's date
    # selects the file to fetch.
    cdate = datetime.date.today().strftime("%y%m%d")
    # Has no effect unless OLDURL contains the literal stamp '150301'.
    DATAURL = OLDURL.replace('150301', cdate)

    # Browser and cookie jar
    br = mechanize.Browser()
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)

    # Browser options
    br.set_handle_equiv(True)
    br.set_handle_gzip(False)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(True)
    # Follow "refresh 0" but do not hang on refresh > 0.
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    # The first request is expected to land on the login form.
    br.open(DATAURL)
    br.select_form(nr=0)
    br.form['nauthemail'] = USERNAME
    br.form['password'] = PASSWORD
    br.submit()

    # Request the report again now that the session is authenticated.
    response = br.open(DATAURL)
    csv_data = response.read()

    # Write the body unchanged; binary mode keeps the original line endings.
    dest_path = os.path.join(folder, 'map-' + cdate + '.csv')
    with open(dest_path, 'wb') as fx:
        fx.write(csv_data)

# Script entry point: run the login and download when the file is executed.
loader()