从Web获取许多图像文件并以异步方式保存它们

时间:2017-08-05 20:54:01

标签: python python-asyncio

我有一个图像文件的Web URL列表。我希望获取所有图像文件并将它们分别写入相应的目录。图像都是PNG。在测试程序中,我能够同步成功获取单个图像:

import urllib.request
import shutil

# This example will download a single traffic image.

# Spoof a well-known browser so servers won't think I am a bot.
class AppURLopener(urllib.request.FancyURLopener):
    """``FancyURLopener`` subclass that overrides only the ``version`` string."""

    # Browser-like identification string; per the urllib.request documentation
    # the opener's ``version`` attribute supplies its User-Agent.
    version = "Mozilla/5.0"

def getTrafficImage(fromUrl, toPath, baseUrl="https://mass511.com/map/Cctv/", timeout=30):
    """Download one camera image and write it to *toPath*.

    Args:
        fromUrl: Camera-specific suffix that is appended to *baseUrl*.
        toPath: Destination file path; it is opened in binary write mode.
        baseUrl: Prefix shared by all camera URLs (defaults to the MASS511
            camera endpoint that used to be hard-coded here).
        timeout: Seconds to wait on the connection before giving up.

    Raises:
        urllib.error.URLError: The server could not be reached, or it
            answered with an HTTP error status (``HTTPError``).
        OSError: The request timed out or *toPath* could not be written.
    """
    url = f"{baseUrl}{fromUrl}"
    # Send a browser-like User-Agent so the server does not reject the request
    # as coming from a bot. urlopen() replaces the deprecated FancyURLopener,
    # which hands back HTTP error pages as if they were the requested image.
    request = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    # The response is obtained first, so no output file is created when the
    # request itself fails.
    with urllib.request.urlopen(request, timeout=timeout) as response, \
            open(toPath, 'wb') as out_file:
        shutil.copyfileobj(response, out_file)

# Camera label on MASS511:
#   I-93-SB-Somerville-Exit 26 Storrow Dr
# URL suffix of that camera; getTrafficImage appends it to its base URL.
url = "406443--1"

# Path to store the file
file_name = "C:/Users/pchernoch/projects/HackWeek/traffic-feeds/I-93-SB-Somerville-Exit.png"

# Executed at module load: fetch the image and write it to file_name.
getTrafficImage(url, file_name)

如何为多个网址重复此操作并让每次提取异步执行?

如果无法获取任何图像或出现错误(如超时),我希望将该错误记录到控制台,但不会停止处理其他文件。

我正在使用 Python 3.6.2。我更倾向于使用新的 async / await 语法以及 aiohttp 和 asyncio 库。不过,任何流行的异步库(例如 curio)都可以。我学习 Python 编程才一个星期,这些内容让我非常困惑。这个答案看起来很有用,但我不知道如何使用它:asyncio web scraping 101: fetching multiple urls with aiohttp

目标:要完成的任务是在一段时间内每隔几秒从许多波士顿摄像机捕捉交通摄像头图像。

以下是我正在尝试编写的程序,我在难住我的地方用 TODO: 做了标记。该程序按计时器运行:每隔几秒钟,它会从每个交通摄像头再捕获一组图像。计时器循环本身不是异步的,但我希望针对多个 URL 的图像捕获是异步的。

import sys, os, datetime, threading, time
import urllib.request
import shutil

# ==================
#    Configuration
# ==================

# Map each camera's display name to its URL suffix on the Mass511 web site.
# CameraPoller appends the suffix to BASE_URL and uses the name as the
# sub-directory (under PHOTO_STORAGE_DIR) that receives the camera's photos.
CAMERA_URLS = {
  "I-93-SB-Somerville-Exit 26 Storrow Dr": "406443--1",
  "Road STWB-WB-TNL-Storrow WB": "1407--1",
  "I-93-NB-Dorchester-between x14 & x15 Savin": "406557"
  }

# All cameras have URLs that begin with this prefix
BASE_URL = "https://mass511.com/map/Cctv/"

# Store photos in subdirectories under this directory
PHOTO_STORAGE_DIR = "C:/Users/pchernoch/projects/HackWeek/traffic-feeds"

# Number of polling rounds; every round asks each camera for one picture
SNAP_COUNT = 5

# Seconds between the scheduled start of one polling round and the next
# (get_images advances its schedule by this amount after every round)
POLLING_INTERVAL_SECONDS = 2

# ==================
#      Classes
# ==================

def logMessage(msg):
    """Write *msg* followed by a newline to standard output and flush it."""
    stream = sys.stdout
    line = msg + '\n'
    stream.write(line)
    # Flush right away so the line is not held back in the output buffer.
    stream.flush()

# Change the presumed name of the browser to fool robot detectors
class AppURLopener(urllib.request.FancyURLopener):
    """``FancyURLopener`` subclass that overrides only the ``version`` string."""

    # Browser-like identification string; per the urllib.request documentation
    # the opener's ``version`` attribute supplies its User-Agent.
    version = "Mozilla/5.0"

# Can Read file from one camera and save to a file
class Camera(object):
  """One traffic camera: where its image comes from and where snapshots go."""

  def __init__(self, sourceUrl, targetDirectory, name, extension):
    """Store the image URL, output directory, display name and file extension."""
    self.SourceUrl, self.TargetDirectory = sourceUrl, targetDirectory
    self.Name, self.Extension = name, extension

  def TargetFile(self, time):
    """Return the snapshot path for the datetime *time*.

    The file name is the timestamp formatted as YYYY-MM-DD-HH-MM-SS, placed
    inside the camera's target directory with the camera's extension.
    """
    stamp = time.strftime("%Y-%m-%d-%H-%M-%S")
    return "{}/{}.{}".format(self.TargetDirectory, stamp, self.Extension)

  def Get(self):
    """Log the fetch that is due now; the download itself is still a TODO."""
    now = datetime.datetime.now()
    fileName = self.TargetFile(now)
    logMessage(f"  - For camera {self.Name}, get {self.SourceUrl} and save as {fileName}")
    # TODO: GET IMAGE FILE FROM WEB AND SAVE IN FILE HERE

# Can poll multiple cameras once
class CameraPoller(object):
  """Holds one Camera per configured URL and polls all of them on demand."""

  def __init__(self, urlMap, baseUrl, rootDir):
    """Create the cameras and make sure each camera's directory exists.

    Args:
        urlMap: Mapping of camera name -> URL suffix.
        baseUrl: Prefix prepended to every URL suffix.
        rootDir: Directory under which one sub-directory per camera name
            is created.

    Raises:
        OSError: A camera directory could not be created (including the case
            where the path already exists but is not a directory).
    """
    self.CamerasToRead = []
    for cameraName, urlSuffix in urlMap.items():
      targetDir = f"{rootDir}/{cameraName}"
      # exist_ok=True replaces the exists()-then-makedirs() pair, which can
      # fail if the directory appears between the check and the creation.
      os.makedirs(targetDir, exist_ok=True)
      camera = Camera(f"{baseUrl}{urlSuffix}", targetDir, cameraName, "png")
      self.CamerasToRead.append(camera)

  def Snap(self):
    """Ask every camera, one after another, for a snapshot."""
    # TODO: MAKE THIS LOOP ASYNC
    for camera in self.CamerasToRead:
      camera.Get()

# Repeatedly poll all cameras, then sleep
def get_images(poller, pollingInterval, snapCount):
    """Poll all cameras *snapCount* times on a fixed schedule.

    Args:
        poller: Object whose ``Snap()`` method takes one round of pictures.
        pollingInterval: Seconds between the scheduled starts of two rounds.
        snapCount: Number of rounds to run.
    """
    next_call = time.time()
    for _ in range(snapCount):
        timeString = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        logMessage(f"\nPoll at {timeString}")
        poller.Snap()
        next_call += pollingInterval
        # A round that takes longer than the interval leaves the schedule in
        # the past; time.sleep() raises ValueError for a negative delay, so
        # clamp to zero and start the next round immediately instead.
        time.sleep(max(0.0, next_call - time.time()))

# ==================
#    Application
# ==================

if __name__ == "__main__":

    cameraPoller = CameraPoller(CAMERA_URLS, BASE_URL, PHOTO_STORAGE_DIR)

    # Poll the cameras on a separate, non-daemon thread; the main thread
    # joins it below, so the program ends only after the polling is done.
    timerThread = threading.Thread(
        target=get_images,
        args=[cameraPoller, POLLING_INTERVAL_SECONDS, SNAP_COUNT],
        daemon=False,
    )
    timerThread.start()
    timerThread.join()

    endTimeString = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    logMessage(f"Exiting Poller at {endTimeString}")

2 个答案:

答案 0 :(得分:1)

这是同一份代码,只是改用 ThreadPoolExecutor 来完成 URL 抓取。这种做法对我的代码改动最少。感谢 @larsks 为我指明了正确的方向。

import sys, os, datetime, threading, time
import urllib.request
from concurrent.futures import ThreadPoolExecutor
import shutil

# ==================
#    Configuration
# ==================

# Map each camera's display name to its URL suffix on the Mass511 web site.
# CameraPoller appends the suffix to BASE_URL and uses the name as the
# sub-directory (under PHOTO_STORAGE_DIR) that receives the camera's photos.
CAMERA_URLS = {
  "I-93-SB-Somerville-Exit 26 Storrow Dr": "406443--1",
  "Road STWB-WB-TNL-Storrow WB": "1407--1",
  "I-93-NB-Dorchester-between x14 & x15 Savin": "406557"
  }

# All cameras have URLs that begin with this prefix
BASE_URL = "https://mass511.com/map/Cctv/"

# Store photos in subdirectories under this directory
PHOTO_STORAGE_DIR = "C:/Users/pchernoch/projects/HackWeek/traffic-feeds"

# Number of polling rounds; every round asks each camera for one picture
SNAP_COUNT = 5

# Seconds between the scheduled start of one polling round and the next
# (get_images advances its schedule by this amount after every round)
POLLING_INTERVAL_SECONDS = 2

# ==================
#      Classes
# ==================

def logMessage(msg):
    """Write *msg* followed by a newline to standard output and flush it."""
    stream = sys.stdout
    line = msg + '\n'
    stream.write(line)
    # Flush right away so the line is not held back in the output buffer.
    stream.flush()

# Change the presumed name of the browser to fool robot detectors
class AppURLopener(urllib.request.FancyURLopener):
    """``FancyURLopener`` subclass that overrides only the ``version`` string."""

    # Browser-like identification string; per the urllib.request documentation
    # the opener's ``version`` attribute supplies its User-Agent.
    version = "Mozilla/5.0"

# Can Read file from one camera and save to a file
class Camera(object):
  """One traffic camera: downloads its current image into its own directory."""

  def __init__(self, sourceUrl, targetDirectory, name, extension):
    """Store the image URL, output directory, display name and file extension."""
    self.SourceUrl = sourceUrl
    self.TargetDirectory = targetDirectory
    self.Name = name
    self.Extension = extension

  def TargetFile(self, time):
    """Return the snapshot path for the datetime *time*.

    The file name is the timestamp formatted as YYYY-MM-DD-HH-MM-SS, placed
    inside the camera's target directory with the camera's extension.
    """
    timeStamp = time.strftime("%Y-%m-%d-%H-%M-%S")
    return f"{self.TargetDirectory}/{timeStamp}.{self.Extension}"

  def Get(self, timeout=30):
    """Download the camera's current image and save it under a timestamped name.

    Args:
        timeout: Seconds to wait on the connection before giving up, so a
            camera that never answers cannot block its worker indefinitely.

    Returns:
        The log message describing the completed download.

    Raises:
        urllib.error.URLError: The server could not be reached, or it
            answered with an HTTP error status (``HTTPError``).
        OSError: The request timed out or the file could not be written.
    """
    fileName = self.TargetFile(datetime.datetime.now())
    message = f"  - For camera {self.Name}, get {self.SourceUrl} and save as {fileName}"
    # Send a browser-like User-Agent so the server does not reject the request
    # as coming from a bot. urlopen() replaces the deprecated FancyURLopener,
    # which hands back HTTP error pages as if they were the requested image.
    request = urllib.request.Request(self.SourceUrl, headers={"User-Agent": "Mozilla/5.0"})
    # Request image file from remote server and save to disk locally.
    with urllib.request.urlopen(request, timeout=timeout) as response, \
        open(fileName, 'wb') as out_file:
      shutil.copyfileobj(response, out_file)
    logMessage(message)
    return message

def snap_picture(camera):
  """Take one snapshot with *camera* and return whatever its ``Get()`` returns."""
  outcome = camera.Get()
  return outcome


# Can poll multiple cameras once
class CameraPoller(object):
  """Holds one Camera per configured URL and polls all of them concurrently."""

  def __init__(self, urlMap, baseUrl, rootDir):
    """Create the cameras and make sure each camera's directory exists.

    Args:
        urlMap: Mapping of camera name -> URL suffix.
        baseUrl: Prefix prepended to every URL suffix.
        rootDir: Directory under which one sub-directory per camera name
            is created.

    Raises:
        OSError: A camera directory could not be created (including the case
            where the path already exists but is not a directory).
    """
    self.CamerasToRead = []
    for cameraName, urlSuffix in urlMap.items():
      targetDir = f"{rootDir}/{cameraName}"
      # exist_ok=True replaces the exists()-then-makedirs() pair, which can
      # fail if the directory appears between the check and the creation.
      os.makedirs(targetDir, exist_ok=True)
      camera = Camera(f"{baseUrl}{urlSuffix}", targetDir, cameraName, "png")
      self.CamerasToRead.append(camera)

  def Snap(self):
    """Take one snapshot from every camera using a pool of worker threads.

    A failure on one camera is logged to the console and does not prevent
    the other cameras from being processed.
    """
    with ThreadPoolExecutor(max_workers=10) as executor:
      pending = [(camera, executor.submit(camera.Get)) for camera in self.CamerasToRead]
      for camera, future in pending:
        try:
          # result() re-raises whatever the worker raised; without this call
          # the exception would be dropped without a trace.
          future.result()
        except Exception as err:  # task boundary: report and keep going
          logMessage(f"  - For camera {camera.Name}, snapshot FAILED: {err!r}")

# Repeatedly poll all cameras, then sleep
def get_images(poller, pollingInterval, snapCount):
    """Poll all cameras *snapCount* times on a fixed schedule.

    Args:
        poller: Object whose ``Snap()`` method takes one round of pictures.
        pollingInterval: Seconds between the scheduled starts of two rounds.
        snapCount: Number of rounds to run.
    """
    next_call = time.time()
    for _ in range(snapCount):
        timeString = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        logMessage(f"\nPoll at {timeString}")
        poller.Snap()
        next_call += pollingInterval
        # A round that takes longer than the interval leaves the schedule in
        # the past; time.sleep() raises ValueError for a negative delay, so
        # clamp to zero and start the next round immediately instead.
        time.sleep(max(0.0, next_call - time.time()))

# ==================
#    Application
# ==================

if __name__ == "__main__":

    cameraPoller = CameraPoller(CAMERA_URLS, BASE_URL, PHOTO_STORAGE_DIR)

    # Poll the cameras on a separate, non-daemon thread; the main thread
    # joins it below, so the program ends only after the polling is done.
    timerThread = threading.Thread(
        target=get_images,
        args=[cameraPoller, POLLING_INTERVAL_SECONDS, SNAP_COUNT],
        daemon=False,
    )
    timerThread.start()
    timerThread.join()

    endTimeString = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    logMessage(f"Exiting Poller at {endTimeString}")

答案 1 :(得分:0)

这是一个 asyncio 版本。代码未经测试,但应该与可运行的版本相差不远。

使用 asyncio 时,基本做法是先启动所有任务,再用 asyncio.gather 收集结果。但是同时发起大量请求往往无法正常工作,因此我还在 CameraPoller 中添加了一个 Semaphore:它确保最多只有 10 个请求并发运行。

import asyncio
import datetime
import os
import time

import aiohttp


# ==================
#    Configuration
# ==================

# Map each camera's display name to its URL suffix on the Mass511 web site.
# CameraPoller appends the suffix to BASE_URL and uses the name as the
# sub-directory (under PHOTO_STORAGE_DIR) that receives the camera's photos.
CAMERA_URLS = {
  "I-93-SB-Somerville-Exit 26 Storrow Dr": "406443--1",
  "Road STWB-WB-TNL-Storrow WB": "1407--1",
  "I-93-NB-Dorchester-between x14 & x15 Savin": "406557"
}

# All cameras have URLs that begin with this prefix
BASE_URL = "https://mass511.com/map/Cctv/"

# Store photos in subdirectories under this directory
PHOTO_STORAGE_DIR = "C:/Users/pchernoch/projects/HackWeek/traffic-feeds"

# Number of polling rounds; every round asks each camera for one picture
SNAP_COUNT = 5

# Seconds between the scheduled start of one polling round and the next
# (CameraPoller.poll advances its schedule by this amount after every round)
POLLING_INTERVAL_SECONDS = 2

# Value of the User-Agent header that Camera.Get sends with each request
USER_AGENT = 'Mozilla/5.0'

# ==================
#      Classes
# ==================

def logMessage(msg):
  """Print *msg* on a line of its own on standard output."""
  print(msg, end="\n")

# Can Read file from one camera and save to a file
class Camera:
  """One traffic camera: downloads its current image through a shared session."""

  def __init__(self, session, sourceUrl, targetDirectory, name, extension):
    """Store the HTTP session, image URL, output directory, name and extension."""
    self.session = session
    self.SourceUrl = sourceUrl
    self.TargetDirectory = targetDirectory
    self.Name = name
    self.Extension = extension

  def TargetFile(self, time):
    """Return the snapshot path for the datetime *time*.

    The file name is the timestamp formatted as YYYY-MM-DD-HH-MM-SS, placed
    inside the camera's target directory with the camera's extension.
    """
    timeStamp = time.strftime("%Y-%m-%d-%H-%M-%S")
    return f"{self.TargetDirectory}/{timeStamp}.{self.Extension}"

  async def Get(self):
    """Download the camera's current image and save it under a timestamped name.

    A response with an HTTP error status is logged and nothing is written,
    so an error page never ends up on disk as an image file. Connection
    errors and file-system errors propagate to the caller.
    """
    fileName = self.TargetFile(datetime.datetime.now())
    # The original statement here had no right-hand side (a syntax error).
    message = f"  - For camera {self.Name}, get {self.SourceUrl} and save as {fileName}"
    # Request image file from remote server
    async with self.session.get(self.SourceUrl, headers={'User-Agent': USER_AGENT}) as resp:
      if resp.status >= 400:
        logMessage(f"  - For camera {self.Name}, get {self.SourceUrl} failed with HTTP status {resp.status}")
        return
      data = await resp.read()
    # and save to disk locally.
    with open(fileName, 'wb') as out_file:
      out_file.write(data)
    logMessage(message)


# Can poll multiple cameras once
class CameraPoller:
  """Holds one Camera per configured URL and polls them with bounded concurrency."""

  def __init__(self, session, urlMap, baseUrl, rootDir, concurrency=10):
    """Create the cameras and make sure each camera's directory exists.

    Args:
        session: HTTP client session handed to every Camera.
        urlMap: Mapping of camera name -> URL suffix.
        baseUrl: Prefix prepended to every URL suffix.
        rootDir: Directory under which one sub-directory per camera name
            is created.
        concurrency: Maximum number of downloads running at the same time.

    Raises:
        OSError: A camera directory could not be created (including the case
            where the path already exists but is not a directory).
    """
    self.CamerasToRead = []
    for cameraName, urlSuffix in urlMap.items():
      url = f"{baseUrl}{urlSuffix}"
      targetDir = f"{rootDir}/{cameraName}"
      # exist_ok=True replaces the exists()-then-makedirs() pair, which can
      # fail if the directory appears between the check and the creation.
      os.makedirs(targetDir, exist_ok=True)
      camera = Camera(session, url, targetDir, cameraName, "png")
      self.CamerasToRead.append(camera)

    self.sem = asyncio.BoundedSemaphore(concurrency)

  async def _snap(self, camera):
    """Download from one camera while holding a slot of the semaphore."""
    async with self.sem:
      await camera.Get()

  async def Snap(self):
    """Take one snapshot from every camera concurrently.

    A failure on one camera is logged to the console and does not cancel or
    hide the downloads of the other cameras.
    """
    # return_exceptions=True makes gather() hand failures back as values
    # instead of raising the first one and abandoning the polling run.
    outcomes = await asyncio.gather(
        *(self._snap(cam) for cam in self.CamerasToRead),
        return_exceptions=True)
    for cam, outcome in zip(self.CamerasToRead, outcomes):
      if isinstance(outcome, BaseException):
        logMessage(f"  - For camera {cam.Name}, snapshot FAILED: {outcome!r}")

  # Repeatedly poll all cameras, then sleep
  async def poll(self, pollingInterval, snapCount):
    """Run *snapCount* polling rounds whose starts are *pollingInterval* seconds apart."""
    loop = asyncio.get_event_loop()
    next_call = loop.time()
    for _ in range(snapCount):
        timeString = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        logMessage(f"\nPoll at {timeString}")

        await self.Snap()

        # The schedule is advanced from the previous scheduled time, not from
        # "now", so slow rounds do not make the schedule drift.
        next_call = next_call + pollingInterval
        await asyncio.sleep(next_call - loop.time())

# ==================
#    Application
# ==================

async def main():
  """Poll all configured cameras through one shared HTTP session, then log the exit time."""
  # ClientSession must be instantiated; the bare class is not an async context manager.
  async with aiohttp.ClientSession() as session:
    poller = CameraPoller(session, CAMERA_URLS, BASE_URL, PHOTO_STORAGE_DIR)
    await poller.poll(POLLING_INTERVAL_SECONDS, SNAP_COUNT)

  endTime = datetime.datetime.now()
  endTimeString = endTime.strftime("%Y-%m-%d-%H-%M-%S")
  logMessage(f"Exiting Poller at {endTimeString}")


if __name__ == "__main__":
  # Run main() to completion on the default event loop.
  asyncio.get_event_loop().run_until_complete(main())