Question

我想从kaggle中读取zip文件数据集，但无法读取该数据集：

# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.request` has been loaded on Python 3.
import urllib.request

# Save the dataset archive to /tmp.
# NOTE(review): this request carries no Kaggle credentials, so the saved file
# is presumably an HTML login page rather than a real zip archive - confirm.
urllib.request.urlretrieve("https://www.kaggle.com/himanshupoddar/zomato-bangalore-restaurants/downloads/zomato-bangalore-restaurants.zip", "/tmp/zomato-bangalore-restaurants.zip")

然后我运行shell脚本以提取文件：

%sh
# Unpack the downloaded archive into the current directory, copy everything
# from the second line of the CSV onward into temp.csv (presumably dropping a
# header row), then delete the extracted CSV.
archive=/tmp/zomato-bangalore-restaurants.zip
csv=zomato-bangalore-restaurants.csv
unzip "$archive"
tail -n +2 "$csv" > temp.csv
rm "$csv"

然后我得到一个错误：

Archive:  /tmp/zomato-bangalore-restaurants.zip
  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the last disk(s) of this archive.
unzip:  cannot find zipfile directory in one of /tmp/zomato-bangalore-restaurants.zip or
        /tmp/zomato-bangalore-restaurants.zip.zip, and cannot find /tmp/zomato-bangalore-restaurants.zip.ZIP, period.
tail: cannot open 'zomato-bangalore-restaurants.csv' for reading: No such file or directory
rm: cannot remove 'zomato-bangalore-restaurants.csv': No such file or directory

Answer 1

注意：由于您尚未登录，因此无法从Kaggle下载文件。

这是下载所有比赛数据集的脚本。

from requests import get, post
from os import mkdir, remove
from os.path import exists
from shutil import rmtree
import zipfile

def purge_all_downloads(db_full_path):
  """Delete the download directory and everything inside it.

  Does nothing when `db_full_path` does not exist.
  """
  if not exists(db_full_path):
    return
  rmtree(db_full_path)

def datasets_are_available_locally(db_full_path, datasets):
  """Tell whether every dataset is already present as a '.csv' file.

  File names are built by plain concatenation (`db_full_path + name + '.csv'`),
  so `db_full_path` has to end with a path separator.

  Returns:
    True only if `db_full_path` exists and contains '<name>.csv' for every
    name in `datasets`; False otherwise.
  """
  # Short-circuits exactly like the explicit early returns would: the
  # per-dataset checks run only when the directory itself exists.
  return exists(db_full_path) and all(
    exists(db_full_path + name + '.csv') for name in datasets
  )

def remove_zip_files(db_full_path, datasets):
  """Delete the '<name>.csv.zip' archive of every dataset.

  Paths are built by plain concatenation, so `db_full_path` has to end with a
  path separator. A missing archive makes `os.remove` raise.
  """
  for name in datasets:
    archive_path = db_full_path + name + '.csv.zip'
    remove(archive_path)

def unzip(db_full_path, datasets):
  """Extract every '<name>.csv.zip' archive into `db_full_path`, then delete the archives.

  Paths are built by plain concatenation, so `db_full_path` has to end with a
  path separator.
  """
  for name in datasets:
    archive_path = db_full_path + name + '.csv.zip'
    with zipfile.ZipFile(archive_path, mode='r') as archive:
      archive.extractall(path=db_full_path)
  # Cleanup runs only after the whole extraction loop finished without raising.
  remove_zip_files(db_full_path, datasets)

def download_datasets(competition, db_full_path, datasets, username, password):
  """Download and extract a Kaggle competition's datasets unless already present.

  Args:
    competition: Competition identifier, inserted into the
      '/c/<competition>/download/' part of the URL.
    db_full_path: Target directory. It must end with a path separator because
      file names are appended to it by plain string concatenation.
    datasets: Collection of dataset base names without the '.csv' suffix; it
      is iterated several times, so it must not be a one-shot iterator.
    username: Kaggle user name, sent as the 'UserName' form field.
    password: Kaggle password, sent as the 'Password' form field.

  Raises:
    requests.HTTPError: If a download request is answered with an HTTP error
      status.
  """
  # Nothing to do when every dataset is already extracted locally.
  if datasets_are_available_locally(db_full_path, datasets):
    print('All the competition datasets have been downloaded, extraced and are ready for you !')
    return

  # Start from an empty directory so leftovers of an earlier, partial download
  # cannot be mistaken for valid data.
  purge_all_downloads(db_full_path)
  mkdir(db_full_path)
  kaggle_info = {'UserName': username, 'Password': password}

  for df in datasets:
    archive_name = df + '.csv.zip'
    url = (
      'https://www.kaggle.com/account/login?ReturnUrl=' +
      '/c/' + competition + '/download/' + archive_name
    )
    response = post(url, data=kaggle_info, stream=True)
    try:
      # Fail here instead of saving an HTTP error page as if it were a zip.
      response.raise_for_status()

      # iter_content yields bytes, so the archive must be written in binary mode.
      with open(db_full_path + archive_name, 'wb') as f:
        for chunk in response.iter_content(chunk_size=512 * 1024):
          if chunk:
            f.write(chunk)
    finally:
      # A streamed response holds its connection until it is closed.
      response.close()

  # extract competition data
  unzip(db_full_path, datasets)
  print('done !')

有关更多详细信息，请参阅“Download the competition data sets directly”。

希望这会有所帮助。

如何在Databricks中读取kaggle zip文件数据集

1 个答案: