Selenium returns zero width and height for images

Time: 2013-12-13 23:35:43

Tags: selenium phantomjs

I am working on an image scraper that uses Selenium (with PhantomJS), and for some reason Selenium reports the width and height of some of the images I scrape as zero... surely an image cannot have a width and height of zero? How do I fix this? Take the page http://www.abacuswealth.com/ as an example: in the lower-left corner there is a small square image with a black circle and a "B" in the middle; it is a B Corp logo. Selenium reports the width and height of this logo as zero.
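To show exactly what I mean, here is a minimal standalone sketch (separate from the full spider further down) that just prints the src and the size Selenium reports for every img element on that page; the PhantomJS path is the same one used in my spider:

#minimal standalone sketch: print the src and the size Selenium reports
#for every <img> element on the page
from selenium import webdriver

driver = webdriver.PhantomJS('/usr/local/bin/phantomjs')
driver.get('http://www.abacuswealth.com/')
for img in driver.find_elements_by_css_selector('img'):
    print img.get_attribute('src'), img.size   #the B Corp logo is one of the images reported as {'width': 0, 'height': 0}
driver.quit()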

Thanks

PS: My code is below. Note that most of the images my Selenium scraper picks up come back with the correct width and height; it is only some of them that come back with zero for both.

#import necessary packages
import os
from scrapy.selector import Selector
from scrapy.contrib.exporter import CsvItemExporter
from scrapy.item import Item, Field
from scrapy.settings import Settings
from scrapy.settings import default_settings 
from selenium import webdriver
from urlparse import urlparse
import csv    
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy import log

#set maximum DEPTH_LIMIT to 3
default_settings.DEPTH_LIMIT = 3

#set Items Fields
#the contents of the Items are 
#to be imported into the jin.csv file   
class jinItem(Item):
    SiteUrl = Field()
    htmlImagesList = Field()
    CSSImagesList = Field()
    ImagesList = Field()
    domain = Field()
    depth = Field()
    ImageIDList = Field()
    WidthHeightList = Field()
    PositionList =Field()
    AreaList=Field()
    totalNumberOfImages=Field()
    totalAreaOfImages=Field()
    ImagesFileNames = Field()
    pass

#defining CrawlSpider
class MySpider(CrawlSpider):

name = "jin"    #set CrawlSpider name to "Jin"
filename ="/Users/hyunjincho/Desktop/BCorp_Websites.csv"
filecontents = csv.reader(open(filename))
#get the list of urls that spider should start crawling from
start_urls = [''.join(f) for f in filecontents] 
#get the list of allowed domains 
allowed_domains = [''.join(f) for f in filecontents]
rules = [Rule(SgmlLinkExtractor(), callback='parse_Item')]

    def parse_Item(self, response):
        Items = []      #define empty Items basket
        self.log('A response from %s just arrived!' % response.url)
        driver = webdriver.PhantomJS('/usr/local/bin/phantomjs')   #open PhantomJS...
                                                                   #and attach it to the selenium driver
        driver.get(response.url)  #let the selenium driver get the url and parse it
        Item = jinItem()  #make an instance of the jinItem class

        #all these mysterious methods (get_site_url(), get_html_images_list(), etc.) are listed below...
        Item['SiteUrl'] = self.get_site_url(driver)
        Item["htmlImagesList"] = self.get_html_images_list(driver)
        Item["CSSImagesList"] = self.get_css_images_list(driver)
        Item["ImagesList"] = self.merge_images_lists(Item["htmlImagesList"], Item["CSSImagesList"])
        Item["domain"] = self.get_domain(str(response.url))
        Item["ImagesFileNames"] = self.get_html_images_file_name(Item["ImagesList"])
        Item["depth"] = self.calculate_depth(Item["SiteUrl"])
        Item["ImageIDList"] = self.get_ids(Item["ImagesList"])
        Item["WidthHeightList"] = self.get_width_and_height(Item["ImagesList"])
        Item["PositionList"] = self.get_position(Item["ImagesList"])
        Item["AreaList"] = self.get_area(Item["WidthHeightList"])
        Item["totalNumberOfImages"] = self.get_total_number_of_images(Item["ImagesList"])
        Item["totalAreaOfImages"] = self.get_total_area_of_images(Item["AreaList"])
        driver.quit()   #shut down this PhantomJS instance before returning
        Items.append(Item)
        return Items

    #use the css selector method of the selenium driver to get css images
    def get_css_images_list(self, driver):
        listOfImagesCSS = driver.find_elements_by_css_selector("img")  #returns list of images
        return listOfImagesCSS

    #use the selenium xpath method to get html images
    def get_html_images_list(self, driver):
        listOfimagesHTML = driver.find_elements_by_xpath('//img')  #returns list of images
        return listOfimagesHTML

    #merge the two image lists together - more specifically:
    #the list that contains CSS images and the list that contains HTML images
    def merge_images_lists(self, listOfImagesCSS, listOfImagesHTML):
        listOfImages = listOfImagesCSS + listOfImagesHTML
        return listOfImages

    #use the selenium get_attribute() method to get the file name of each image
    def get_html_images_file_name(self, listOfImages):
        filenames = range(0, len(listOfImages))
        for j in range(0, len(listOfImages)):
            filepath = listOfImages[j].get_attribute("src")
            path, filenames[j] = os.path.split(filepath)
        return filenames

    #calculate depth
    def calculate_depth(self, siteUrl):
        depth = siteUrl.count("/") - 2
        return depth

    #get domain (URL of the main website)
    def get_domain(self, url):
        parsed_uri = urlparse(url)
        domain = '{uri.scheme}://{uri.netloc}'.format(uri=parsed_uri)
        return domain

    #get the URL of the sub-page
    def get_site_url(self, driver):
        url = driver.current_url
        return url

    #get ids of images
    def get_ids(self, listOfImages):
        ids = range(0, len(listOfImages))
        for j in range(0, len(listOfImages)):
            ids[j] = listOfImages[j].id  #note: .id is selenium's internal element reference,
                                         #not the HTML id attribute (use get_attribute("id") for that)
        return ids

    #get width and height of images
    def get_width_and_height(self, listOfImages):
        sizes = range(0, len(listOfImages))
        for j in range(0, len(listOfImages)):
            sizes[j] = listOfImages[j].size  #returns something like:
                                             #{'width': 77, 'height': 2}
        return sizes

    #get x and y offsets (coordinates)
    def get_position(self, listOfImages):
        position = range(0, len(listOfImages))
        for j in range(0, len(listOfImages)):
            position[j] = listOfImages[j].location  #returns list of:
                                                    #x and y coordinates of images
        return position

    #calculate area
    #based on the width and height of images obtained through
    #the get_width_and_height() method
    def get_area(self, listOfSizes):
        areas = range(0, len(listOfSizes))
        for i in range(0, len(listOfSizes)):
            width = listOfSizes[i]["width"]
            height = listOfSizes[i]["height"]
            areas[i] = width * height  #areas is a list of the area of each image
        return areas

    #get the total number of images found on a given page
    def get_total_number_of_images(self, listOfImage):
        totalNumberOfImages = len(listOfImage)
        return totalNumberOfImages

    #get the total area occupied by all the images found on the page
    def get_total_area_of_images(self, listOfArea):
        sumOfArea = 0
        for j in range(0, len(listOfArea)):
            sumOfArea = sumOfArea + listOfArea[j]
        totalAreaOfImages = sumOfArea  #sum of the areas of the images
        return totalAreaOfImages

#...let the spider Crawl!!!!!!
MySpider()
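Since element.size is the rendered size that comes out of layout, my guess is that an image that is hidden or has not finished loading when the size is read can come back as zero. As a possible fallback (just a sketch, not a confirmed fix), I have been considering a variant of get_width_and_height() inside MySpider that asks for the image's intrinsic naturalWidth/naturalHeight DOM properties whenever the rendered size is zero:

    #possible fallback (a sketch, not a confirmed fix): when the rendered size is zero,
    #ask for the image's intrinsic size via the naturalWidth/naturalHeight DOM properties
    def get_width_and_height_with_fallback(self, listOfImages):
        sizes = []
        for img in listOfImages:
            size = img.size  #rendered size from layout, e.g. {'width': 77, 'height': 2}
            if size['width'] == 0 or size['height'] == 0:
                size = {'width': int(img.get_attribute('naturalWidth') or 0),
                        'height': int(img.get_attribute('naturalHeight') or 0)}
            sizes.append(size)
        return sizes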

0 Answers:

There are no answers yet.