我正在编写一个基于 Selenium(配合 PhantomJS)的图像爬虫。由于某些原因,Selenium 报告我抓取的部分图像的宽度和高度为零……图像的宽度和高度当然不可能是零吧?我该如何修复这个错误?网页
http://www.abacuswealth.com/
,例如,如果你看页面左下角,有一个小的方形图像,中间是一个黑色圆圈,圆圈里有字母 B——这是一个 BCorp 标志。Selenium 报告该标志的宽度和高度都是零。
谢谢
PS:下面是我的代码——请注意,Selenium 抓取到的大部分图像的宽度和高度都是正确的,只有其余一部分图像的宽度和高度为零。
#import necessary packages
import os
from scrapy.selector import Selector
from scrapy.contrib.exporter import CsvItemExporter
from scrapy.item import Item, Field
from scrapy.settings import Settings
from scrapy.settings import default_settings
from selenium import webdriver
from urlparse import urlparse
import csv
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy import log
#set maximum DEPTH_LIMIT to 3: links more than three hops away from a
#start url are not followed by the CrawlSpider.
# NOTE(review): mutating scrapy.settings.default_settings at import time
# affects every spider in the process; per-spider settings (or the project
# settings.py) are the conventional place -- confirm the global effect is wanted.
default_settings.DEPTH_LIMIT = 3
#set Items Fields
#the contents of the Items are
#to be imported into the jin.csv file
class jinItem(Item):
    """Per-page image statistics collected by MySpider and exported to jin.csv."""
    SiteUrl = Field()                # URL of the page this item describes
    htmlImagesList = Field()         # <img> elements located via XPath
    CSSImagesList = Field()          # <img> elements located via CSS selector
    ImagesList = Field()             # the two lists above, concatenated
    domain = Field()                 # scheme://netloc of the page's URL
    depth = Field()                  # crawl depth derived from the URL
    ImageIDList = Field()            # selenium element ids of the images
    WidthHeightList = Field()        # per-image {'width': w, 'height': h}
    PositionList = Field()           # per-image {'x': ..., 'y': ...} coordinates
    AreaList = Field()               # per-image width * height
    totalNumberOfImages = Field()    # count of collected image elements
    totalAreaOfImages = Field()      # sum over AreaList
    ImagesFileNames = Field()        # file-name part of each image's src
#defining CrawlSpider
class MySpider(CrawlSpider):
    """Crawl every site listed in BCorp_Websites.csv and, for each page
    reached, re-load it in a Selenium-driven PhantomJS browser to collect
    image metadata (file names, ids, sizes, positions, areas) into a jinItem.
    """
    name = "jin"  #set CrawlSpider name to "jin"
    filename = "/Users/hyunjincho/Desktop/BCorp_Websites.csv"
    # BUG FIX: csv.reader is a one-shot iterator.  The original code consumed
    # it once to build start_urls and then again for allowed_domains, so
    # allowed_domains always ended up empty.  Read the rows into a list once
    # and build both class attributes from that list.
    filecontents = [''.join(row) for row in csv.reader(open(filename))]
    #get the list of urls that spider should start crawling from
    start_urls = list(filecontents)
    #get the list of allowed domains
    # NOTE(review): scrapy expects bare domains here ("example.com"), not
    # full URLs -- confirm the CSV rows match that expectation.
    allowed_domains = list(filecontents)
    rules = [Rule(SgmlLinkExtractor(), callback='parse_Item')]

    def parse_Item(self, response):
        """Build one jinItem describing the images on the responded page.

        Returns a one-element list containing the populated jinItem
        (scrapy accepts any iterable of items from a callback).
        """
        Items = []  #define empty Items basket
        self.log('A response from %s just arrived!' % response.url)
        #open PhantomJS and attach it to the selenium driver
        driver = webdriver.PhantomJS('/usr/local/bin/phantomjs')
        # BUG FIX: quit the browser even when scraping raises; the original
        # never called quit(), leaking one phantomjs process per page.
        try:
            driver.get(response.url)  # let selenium load and render the url
            Item = jinItem()  #make instance of class jinItem
            Item['SiteUrl'] = self.get_site_url(driver)
            Item["htmlImagesList"] = self.get_html_images_list(driver)
            Item["CSSImagesList"] = self.get_css_images_list(driver)
            Item["ImagesList"] = self.merge_images_lists(Item["htmlImagesList"], Item["CSSImagesList"])
            Item["domain"] = self.get_domain(str(response.url))
            Item["ImagesFileNames"] = self.get_html_images_file_name(Item["ImagesList"])
            Item["depth"] = self.calculate_depth(Item["SiteUrl"])
            Item["ImageIDList"] = self.get_ids(Item["ImagesList"])
            Item["WidthHeightList"] = self.get_width_and_height(Item["ImagesList"])
            Item["PositionList"] = self.get_position(Item["ImagesList"])
            Item["AreaList"] = self.get_area(Item["WidthHeightList"])
            Item["totalNumberOfImages"] = self.get_total_number_of_images(Item["ImagesList"])
            Item["totalAreaOfImages"] = self.get_total_area_of_images(Item["AreaList"])
            Items.append(Item)
        finally:
            driver.quit()
        return Items

    #use css selector of selenium driver to get css images
    def get_css_images_list(self, driver):
        """Return all <img> elements located with a CSS selector."""
        return driver.find_elements_by_css_selector("img")

    #use selenium xpath method to get html images
    def get_html_images_list(self, driver):
        """Return all <img> elements located with an XPath query."""
        return driver.find_elements_by_xpath('//img')

    #merge the two images lists together - more specifically:
    #the list that contains CSS images and the list that contains HTML images
    def merge_images_lists(self, listOfImagesCSS, listOfImagesHTML):
        """Concatenate the CSS-located and XPath-located image lists.

        NOTE(review): both selectors match the same <img> nodes, so every
        image appears twice in the merged list and in all derived totals --
        confirm this duplication is intended.
        """
        return listOfImagesCSS + listOfImagesHTML

    #use selenium's get_attribute() method to get the file name of each image
    def get_html_images_file_name(self, listOfImages):
        """Return the file-name component of each image's src attribute."""
        return [os.path.split(img.get_attribute("src"))[1] for img in listOfImages]

    #calculate depth
    def calculate_depth(self, siteUrl):
        """Approximate crawl depth as the number of '/' in the URL, minus
        the two slashes in the scheme separator ('http://')."""
        return siteUrl.count("/") - 2

    #get domain (URL of main website)
    def get_domain(self, url):
        """Return scheme://netloc of *url* (the site's root)."""
        parsed_uri = urlparse(url)
        return '{uri.scheme}://{uri.netloc}'.format(uri=parsed_uri)

    #get the URL of the sub-website
    def get_site_url(self, driver):
        """Return the URL currently loaded in the browser."""
        return driver.current_url

    #get id of images
    def get_ids(self, listOfImages):
        """Return selenium's internal element id for every image."""
        return [img.id for img in listOfImages]

    #get width and height of images
    def get_width_and_height(self, listOfImages):
        """Return each image's rendered size, e.g. {'width': 77, 'height': 2}.

        A 0x0 size usually means the element was not rendered when it was
        measured: hidden by CSS, drawn as a background image, or loaded
        lazily.  Waiting for the page to finish rendering (or scrolling the
        element into view) before measuring avoids most spurious zero sizes.
        """
        return [img.size for img in listOfImages]

    #get x and y offset (coordinates)
    def get_position(self, listOfImages):
        """Return each image's {'x': ..., 'y': ...} page coordinates."""
        return [img.location for img in listOfImages]

    #calculate area based on the width and height of images obtained
    #through the get_width_and_height() method
    def get_area(self, listOfSizes):
        """Return width*height for every size dict from get_width_and_height()."""
        return [size["width"] * size["height"] for size in listOfSizes]

    #get total number of images found in a certain website
    def get_total_number_of_images(self, listOfImage):
        """Return how many image elements were collected for the page."""
        return len(listOfImage)

    #get total area occupied by all the images found on the page
    def get_total_area_of_images(self, listOfArea):
        """Return the summed pixel area of all images on the page."""
        return sum(listOfArea)
#...let the spider Crawl!!!!!!
# NOTE(review): instantiating the spider at module import does not start a
# crawl by itself -- scrapy normally constructs and runs spiders via the
# crawler engine (`scrapy crawl jin`). Confirm this line is actually needed.
MySpider()