When I run my crawler from the Anaconda prompt, I get this error:
ModuleNotFoundError: No module named 'ihs'
Part of the Python file looks like this:
from ihs.items import IhsItem
from ihs.library import Library
Which library do I need to install to fix this error?
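For context: judging from these imports, ihs does not look like a library from PyPI; it appears to be the Scrapy project's own package, i.e. the folder that contains items.py and library.py. A minimal sketch of what ihs/items.py presumably looks like, with field names inferred from the spider code below (these are assumptions, not the actual project file):

import scrapy

class IhsItem(scrapy.Item):
    # Field names inferred from the spider code below (assumed, not verified)
    spider_identifier = scrapy.Field()
    property_url = scrapy.Field()
    property_type = scrapy.Field()
    listing_type = scrapy.Field()
    short_description = scrapy.Field()
    description = scrapy.Field()
    price = scrapy.Field()
    bedroom = scrapy.Field()
    bathroom = scrapy.Field()
    parking = scrapy.Field()
    photo = scrapy.Field()
    land_area = scrapy.Field()
    building_area = scrapy.Field()
    # ...plus the agent_*/agency_* fields set in parse_agent_info_url

If a package like this exists in the project, the error usually just means the spider was started from a directory where Python cannot find it.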
Full code:
import scrapy
import logging
import os
import sys
# Python 3: the Python 2 urlparse module was merged into urllib.parse
from urllib.parse import urlparse, urljoin
from datetime import datetime
from ihs.items import IhsItem
from ihs.library import Library
class McewingpartnersSpider(scrapy.Spider):
name = "mcewingpartners"
lib = Library()
allowed_domains = ["mcewingpartners.com"]
start_urls = (
'http://www.mcewingpartners.com/?/rent/residential/',
)
    def __init__(self, *args, **kwargs):
        # Let the base Spider initialise itself before attaching log handlers
        super().__init__(*args, **kwargs)
        self.log_dir = os.getcwd()
formatter = logging.Formatter('%(asctime)s %(message)s')
self.log = logging.getLogger('log-mcewingpartners')
self.log.setLevel(logging.INFO)
        # ':' is not allowed in Windows file names (Anaconda prompt), so use '-' in the time part
        log_file_name = datetime.now().strftime('logs/mcewingpartners_%H-%M_%d-%m-%Y.log')
        os.makedirs('logs', exist_ok=True)  # FileHandler fails if the logs/ directory is missing
        ch_file = logging.FileHandler(log_file_name, 'w')
ch_file.setLevel(logging.ERROR)
ch_file.setFormatter(formatter)
self.log.addHandler(ch_file)
# add formatter to ch
ch_stream = logging.StreamHandler()
ch_stream.setFormatter(formatter)
# add ch to logger
self.log.addHandler(ch_stream)
self.log.info("mcewingpartners Ready.")
def parse(self, response):
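        # Listing page: collect the links to individual property pages and follow each one.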
try:
site = response.url
domain = "mcewingpartners.com"
            if urlparse(site).scheme == '':
if site.startswith('//') or site.startswith('\\'):
site = 'http:' + site
else:
site = 'http://' + site
# Property Urls
property_urls = []
_property_urls = response.xpath("//a[@class='listing-item']/@href").extract()
for _property_url in _property_urls:
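                # Some hrefs on this site appear to be JavaScript redirects of the form
                # self.location='...'; strip that wrapper to recover the bare URL (assumption).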
token = 'self.location'
if token in _property_url[0:len(token) + 1]:
_property_url = _property_url[len(token) + 2:len(_property_url)-1]
                _property_url = urljoin(site, _property_url)
if _property_url not in property_urls:
property_urls.append(_property_url)
# check last page
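            # meta['prev_urls'] is only set by the next-page request below, which is
            # commented out, so this guard currently never fires.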
if response.meta.get('prev_urls') == property_urls:
return # reached the last page
for property_url in property_urls:
                yield scrapy.Request(property_url, callback=self.parse_property_urls, dont_filter=True)
            #next_page_url = response.xpath("//*[@id='pageNext2']/@href").extract()
            #if len(next_page_url) > 0:
            #    next_page_url = next_page_url[0].strip()
            #    if domain not in next_page_url:
            #        next_page_url = urljoin(site, next_page_url)
            #    # pass the current page's URLs so the last-page check above can fire
            #    yield scrapy.Request(next_page_url, callback=self.parse, meta={'prev_urls': property_urls})
        except Exception as e:
            err = "Error in parse: %s, Message: %s, Traceback: %s" % (response.url, str(e), sys.exc_info())
            self.log.error(err)
            self.lib.PrintException('mcewingpartners')
            #self.lib.send_email(err, self.name)
def parse_property_urls(self, response):
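        # Parse a single property detail page into an IhsItem, then fetch agent details.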
try:
site = response.url
domain = "mcewingpartners.com"
            if urlparse(site).scheme == '':
if site.startswith('//'):
site = 'http:' + site
else:
site = 'http://' + site
item = IhsItem()
item['spider_identifier'] = self.name
item['property_url'] = response.url
#Static Value
item['property_type'] = 'Other Residential'
#Static Value
item['listing_type'] = 'For Rent'
#Default value for short_description
item['short_description'] = 'Please Contact Agent'
short_description = response.xpath("//div[@class='col-md-24 property-description']//h3/text()").extract()
if len(short_description) > 0:
                # .encode returns bytes in Python 3; decode back to str after dropping non-ASCII
                short_description = short_description[0].strip().encode('ascii', 'ignore').decode('ascii')
item['short_description'] = short_description.strip()
#Default value for description
item['description'] = 'Please Contact Agent'
# Aggregation of multiple values from multiple xpaths
description = []
_description = response.xpath("//div[@class='col-md-24 property-description']//div[@class='inner border-bot']/p/text()").extract()
for _pd in _description:
if _pd not in description:
description.append(_pd)
if len(description) > 0:
                item['description'] = ' '.join(description).encode('ascii', 'ignore').decode('ascii').strip()
#Default value for price
item['price'] = 'Please Contact Agent'
price = response.xpath("//div[@class='row property-header']//div[@class='inner border-bot']//h4/text()").extract()
if len(price) > 0:
                price = price[0].strip().encode('ascii', 'ignore').decode('ascii')
item['price'] = price.strip()
#Default value for bedroom
item['bedroom'] = '0'
bedroom = response.xpath("(//div[@class='bbc-icon filter-bed']/text())[2]").extract()
if len(bedroom) > 0:
                bedroom = bedroom[0].strip().encode('ascii', 'ignore').decode('ascii')
item['bedroom'] = bedroom.strip()
#Default value for bathroom
item['bathroom'] = '0'
bathroom = response.xpath("(//div[@class='bbc-icon filter-bath']/text())[2]").extract()
if len(bathroom) > 0:
                bathroom = bathroom[0].strip().encode('ascii', 'ignore').decode('ascii')
item['bathroom'] = bathroom.strip()
#Default value for parking
item['parking'] = '0'
parking = response.xpath("(//div[@class='bbc-icon filter-car']/text())[2]").extract()
if len(parking) > 0:
                parking = parking[0].strip().encode('ascii', 'ignore').decode('ascii')
item['parking'] = parking.strip()
#Default value for photo
item['photo'] = []
# Aggregation of multiple values from multiple xpaths
photo = []
_photo = response.xpath("//div[@class='carousel-inner']//img/@src").extract()
for _pd in _photo:
if _pd not in photo:
photo.append(_pd)
if len(photo) > 0:
item['photo'] = photo
#Default value for land_area
item['land_area'] = '0'
#Default value for building_area
item['building_area'] = '0'
#Default value for inspection_date
#item['inspection_date'] = ''
#inspection_date = response.xpath("//div[@class='font13 colorthreefontcolor left valueInsp']/text()").extract()
#if len(inspection_date) > 0:
# inspection_date = inspection_date[0].strip().encode('ascii','ignore')
# item['inspection_date'] = inspection_date.strip()
#self.parse_agent_info_url(self, response)
#agent_info_url
#agent_info_url = response.xpath("//div[@id='property_staffMember']/div[@class='left staffImageContainer']/a/@href").extract()
#if len(agent_info_url) > 0:
# agent_info_url = agent_info_url[0].strip()
# agent_info_url = urlparse.urljoin(site, agent_info_url)
            # This request must be yielded, otherwise Scrapy never schedules it
            # and parse_agent_info_url never runs.
            yield scrapy.Request(site, callback=self.parse_agent_info_url, dont_filter=True, meta={'item': item})
#else:
# # Default Value for items in the above branch url
# #Default value for agent_name
# item['agent_name'] = self.name.split('_')[0]
#
#Default value for agency_name
# item['agency_name'] = ''
#Default value for agent_proprietor_name
# item['agent_proprietor_name'] = self.name.split('_')[0]
#Default value for agent_licencee_number
# item['agent_licencee_number'] = self.name.split('_')[0]
#Default value for agent_licencee_name
# item['agent_licencee_name'] = self.name.split('_')[0]
#Default value for agency_logo
# item['agency_logo'] = ''
#Default value for agency_email
# item['agency_email'] = ''
#Default value for agent_email
# item['agent_email'] = 'ihomeseek@outlook.com'
#Default value for agent_mobile
# item['agent_mobile'] = ''
#Default value for agency_phone
# item['agency_phone'] = ''
#Default value for agent_fax
# item['agent_fax'] = ''
#Default value for agent_color
# item['agent_color'] = ''
            # The completed item is yielded by parse_agent_info_url.
        except Exception as e:
            err = "Error in parse_property_urls: %s, Message: %s, Traceback: %s" % (response.url, str(e), sys.exc_info())
            self.log.error(err)
            self.lib.PrintException('mcewingpartners')
            #self.lib.send_email(err, self.name)
def parse_agent_info_url(self, response):
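        # Fill in the agent/agency fields on the item received via meta, then yield it.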
try:
            site = response.url
            domain = "mcewingpartners.com"
            # Add a scheme only when the URL does not already have one
            if urlparse(site).scheme == '':
                if site.startswith('//') or site.startswith('\\'):
                    site = 'http:' + site
                else:
                    site = 'http://' + site
item = response.meta['item']
#Default value for agent_name
item['agent_name'] = self.name.split('_')[0]
agent_name = response.xpath("//a[@class='agent-logo-inner']//img/@src").extract()
if len(agent_name) > 0:
                agent_name = agent_name[0].strip().encode('ascii', 'ignore').decode('ascii')
item['agent_name'] = agent_name.strip()
#Static Value
item['agency_name'] = 'McEwing Partners'
#Static Value
item['agent_proprietor_name'] = 'McEwing Partners'
#Static Value
item['agent_licencee_number'] = 'xxxxxxx'
#Static Value
item['agent_licencee_name'] = 'xxxxxxx'
#Default value for agency_logo
item['agency_logo'] = ''
agency_logo = response.xpath("//ul[@class='images']/li[@class='images']/img/@src").extract()
if len(agency_logo) > 0:
                agency_logo = agency_logo[0].strip().encode('ascii', 'ignore').decode('ascii')
item['agency_logo'] = agency_logo.strip()
#Default value for agency_email
item['agency_email'] = ''
agency_email = response.xpath("//*[@id='staffMember']/div[2]/span/a/text()").extract()
if len(agency_email) > 0:
                agency_email = agency_email[0].strip().encode('ascii', 'ignore').decode('ascii')
item['agency_email'] = agency_email.strip()
#Default value for agent_email
item['agent_email'] = '** no mail **'
agent_email = response.xpath("//*[@id='staffMember']/div[2]/span/a/text()").extract()
if len(agent_email) > 0:
                agent_email = agent_email[0].strip().encode('ascii', 'ignore').decode('ascii')
item['agent_email'] = agent_email.strip()
#Default value for agent_mobile
item['agent_mobile'] = ''
agent_mobile = response.xpath("//a[@class='property-staff-link']//span/text()").extract()
if len(agent_mobile) > 0:
                agent_mobile = agent_mobile[0].strip().encode('ascii', 'ignore').decode('ascii')
item['agent_mobile'] = agent_mobile.strip()
#Static Value
item['agency_phone'] = '03 5975 4555'
#Static Value
item['agent_fax'] = '03 5975 6444'
#Static Value
item['agent_color'] = '#3F3F3F'
yield item
        except Exception as e:
            err = "Error in parse_agent_info_url: %s, Message: %s, Traceback: %s" % (response.url, str(e), sys.exc_info())
            self.log.error(err)
            self.lib.PrintException('mcewingpartners')
            #self.lib.send_email(err, self.name)
# Legacy hook from very old Scrapy versions; current Scrapy discovers spiders
# by class, so this module-level instance is not required.
SPIDER = McewingpartnersSpider()