无法使用 requests 获取所需的 HTML

时间:2016-11-11 11:36:48

标签: python web-scraping bs4

访问 Flickr 上的图片页面链接时,requests 返回的 HTML 只到下面这条注释为止:

`<!-- rendered with love by pprd1-node580-lh1.manhattan.bf1.yahoo.com -->`

(见下面的html图片)。

我想获取该注释下方第 3 个 div 元素内 img 元素中的链接,非常感谢任何建议。

from bs4 import BeautifulSoup
import logging
import os
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import shutil
import sys
import time

# Configure the root logger: show DEBUG and above, with each record laid out
# as "<timestamp> - <level name> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.DEBUG,
)

def flickr_images():
    """Search Flickr for a term and fetch the photo page of each result.

    Reads two command-line arguments from ``sys.argv``: the search term and
    the number of images wanted. Opens the Flickr search page in Firefox via
    Selenium, collects the ``href`` of up to that many elements with class
    ``overlay``, then downloads each linked page with ``requests`` and selects
    its ``.zoom-small`` elements with BeautifulSoup.

    Returns:
        None. When the arguments are missing, surplus, or the count is not an
        integer, a usage message is printed and the function returns early.

    Raises:
        requests.HTTPError: if a photo page responds with an error status.
    """
    try:
        search_term, number_images = sys.argv[1:]
        num_req_images = int(number_images)
    except ValueError:
        # Raised both by a wrong number of arguments (tuple unpacking) and by
        # a count that is not an integer.
        print("Something went wrong. Command line input must be of "
              "format: 'filename searchterm numberimages'")
        return

    # navigate to search results page
    driver = webdriver.Firefox()
    try:
        # poll DOM for max 10 secs if element not immediately available
        driver.implicitly_wait(10)
        driver.get("https://www.flickr.com/search/?text=" + search_term)
        driver.maximize_window()

        # 0sec wait = 25images, 1sec = 48, 3+sec = 98
        time.sleep(3)
        image_link_elems = driver.find_elements(By.CLASS_NAME, "overlay")

        # In case requested is > found; clamp at 0 so a negative request
        # selects nothing instead of slicing off the last element.
        num_images_tosave = max(0, min(num_req_images, len(image_link_elems)))
        image_elems_tosave = image_link_elems[:num_images_tosave]
        print("{} images found.".format(num_images_tosave))
        logging.info("Length photos: %s", len(image_link_elems))

        # Read the hrefs while the browser is still open; they are plain
        # strings, so the driver is not needed after this point.
        image_links = [link.get_attribute("href")
                       for link in image_elems_tosave]
    finally:
        # Always close the browser, even if the search page fails to load.
        driver.quit()

    # extract image src's from found elements
    # NOTE(review): src_links is never filled and src_elem is never read in
    # the visible code -- the snippet appears to stop before that step.
    src_links = []
    for image_link in image_links:
        if not image_link:
            # Element had no href attribute; nothing to fetch.
            continue
        res = requests.get(image_link)
        res.raise_for_status()
        # NOTE(review): the plain HTTP response may not contain the
        # client-side rendered image markup, in which case this selects
        # nothing -- confirm against the live page.
        soup = BeautifulSoup(res.text, "html.parser")
        src_elem = soup.select(".zoom-small")

HTML图片:

enter image description here

0 个答案:

没有答案