ModuleNotFoundError: No module named 'ihs'

Asked: 2017-10-08 11:12:15

Tags: python-3.x web-scraping scrapy

I get an error when I run my spider from the Anaconda prompt:

ModuleNotFoundError: No module named 'ihs'
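In itself, the message only says that no package named 'ihs' is visible on Python's import path. A quick way to inspect that path from the same Anaconda prompt (a minimal check, not specific to this project):

import sys
# the directory that contains the "ihs" folder must appear in this list
# for "from ihs.items import IhsItem" to work
print(sys.path)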

Part of the Python file looks like this:

from ihs.items import IhsItem
from ihs.library import Library

Which library should I install to fix this error?
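For context: 'ihs' is not a library on PyPI, so there is nothing to pip install. Judging by the imports, it is the Scrapy project's own package, i.e. the directory holding items.py and library.py, so the error usually means the spider is being run from outside the project root (the directory containing scrapy.cfg). Below is a minimal sketch, assuming a standard Scrapy layout, of what ihs/items.py presumably declares given the fields the spider assigns; the real field list may differ:

# Hypothetical ihs/items.py, reconstructed from the fields the spider sets.
import scrapy

class IhsItem(scrapy.Item):
    spider_identifier = scrapy.Field()
    property_url = scrapy.Field()
    property_type = scrapy.Field()
    listing_type = scrapy.Field()
    short_description = scrapy.Field()
    description = scrapy.Field()
    price = scrapy.Field()
    bedroom = scrapy.Field()
    bathroom = scrapy.Field()
    parking = scrapy.Field()
    photo = scrapy.Field()
    land_area = scrapy.Field()
    building_area = scrapy.Field()
    # ...plus the agent/agency fields the spider also sets
    # (agent_name, agency_name, agent_email, agency_phone, ...)

If the layout matches, running scrapy crawl mcewingpartners from the project root should make the 'ihs' imports resolve; alternatively, add that directory to PYTHONPATH.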

Full code:

import scrapy
import re
import logging
import os
import sys
import smtplib
import urllib.parse as urlparse  # Python 3: urlparse/urljoin moved into urllib.parse
from bs4 import BeautifulSoup
from datetime import datetime
from scrapy.exceptions import CloseSpider
from ihs.items import IhsItem
from ihs.library import Library

class McewingpartnersSpider(scrapy.Spider):

    name = "mcewingpartners"
    lib = Library()

    allowed_domains = ["mcewingpartners.com"]
    start_urls = (
        'http://www.mcewingpartners.com/?/rent/residential/',
    )

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)  # forward Scrapy's spider arguments to the base class
        self.log_dir = os.getcwd()
        formatter = logging.Formatter('%(asctime)s %(message)s')
        self.log = logging.getLogger('log-mcewingpartners')
        self.log.setLevel(logging.INFO)

        os.makedirs('logs', exist_ok=True)  # FileHandler raises if the directory is missing
        log_file_name = datetime.now().strftime('logs/mcewingpartners_%H-%M_%d-%m-%Y.log')  # ':' is not allowed in Windows file names
        ch_file = logging.FileHandler(log_file_name, 'w')
        ch_file.setLevel(logging.ERROR)
        ch_file.setFormatter(formatter)
        self.log.addHandler(ch_file)

        # add formatter to ch
        ch_stream = logging.StreamHandler()
        ch_stream.setFormatter(formatter)

        # add ch to logger
        self.log.addHandler(ch_stream)
        self.log.info("mcewingpartners Ready.")

    def parse(self, response):
        try:
            site = response.url
            domain = "mcewingpartners.com"
            if urlparse.urlparse(site).scheme == '':
                if site.startswith('//') or site.startswith('\\'):
                    site = 'http:' + site
                else:
                    site = 'http://' + site
            # Property Urls
            property_urls = []
            _property_urls = response.xpath("//a[@class='listing-item']/@href").extract()
            for _property_url in _property_urls:
                token = 'self.location'
                if token in _property_url[0:len(token) + 1]:
                    _property_url = _property_url[len(token) + 2:len(_property_url)-1]
                _property_url = urlparse.urljoin(site, _property_url)
                if _property_url not in property_urls:
                    property_urls.append(_property_url)

            # check last page
            if response.meta.get('prev_urls') == property_urls:
                return  # reached the last page
            for property_url in property_urls:
                yield scrapy.Request(property_url, callback = self.parse_property_urls, dont_filter = True)

            #next_page_url = response.xpath("//*[@id='pageNext2']/@href").extract()
            #if len(next_page_url) > 0:
            #   next_page_url = next_page_url[0].strip()
            #   if domain not in next_page_url:
            #       next_page_url = urlparse.urljoin(site, next_page_url)
            #   yield scrapy.Request(next_page_url, callback = self.parse)

        except Exception as e:
            err = "Error in parse: %s, Message: %s, Traceback: %s " % (response.url, str(e), sys.exc_info())
            self.log.error(err)
            self.lib.PrintException('mcewingpartners')
            #self.lib.send_email(err, self.name)

    def parse_property_urls(self, response):
        try:
            site = response.url
            domain = "mcewingpartners.com"
            if urlparse.urlparse(site).scheme == '':
                if site.startswith('//'):
                    site = 'http:' + site
                else:
                    site = 'http://' + site

            item = IhsItem()
            item['spider_identifier'] = self.name
            item['property_url'] = response.url


            #Static Value
            item['property_type'] = 'Other Residential'


            #Static Value
            item['listing_type'] = 'For Rent'


            #Default value for short_description
            item['short_description'] = 'Please Contact Agent'
            short_description = response.xpath("//div[@class='col-md-24 property-description']//h3/text()").extract()
            if len(short_description) > 0:
                short_description = short_description[0].strip().encode('ascii', 'ignore').decode('ascii')  # keep str, not bytes, on Python 3
                item['short_description'] = short_description.strip()


            #Default value for description
            item['description'] = 'Please Contact Agent'
            # Aggregation of multiple values from multiple xpaths
            description = []

            _description = response.xpath("//div[@class='col-md-24 property-description']//div[@class='inner border-bot']/p/text()").extract()
            for _pd in _description:
                if _pd not in description:
                    description.append(_pd)

            if len(description) > 0:
                item['description'] = ' '.join(description).encode('ascii', 'ignore').decode('ascii').strip()


            #Default value for price
            item['price'] = 'Please Contact Agent'
            price = response.xpath("//div[@class='row property-header']//div[@class='inner border-bot']//h4/text()").extract()
            if len(price) > 0:
                price = price[0].strip().encode('ascii', 'ignore').decode('ascii')
                item['price'] = price.strip()


            #Default value for bedroom
            item['bedroom'] = '0'
            bedroom = response.xpath("(//div[@class='bbc-icon filter-bed']/text())[2]").extract()
            if len(bedroom) > 0:
                bedroom = bedroom[0].strip().encode('ascii', 'ignore').decode('ascii')
                item['bedroom'] = bedroom.strip()


            #Default value for bathroom
            item['bathroom'] = '0'
            bathroom = response.xpath("(//div[@class='bbc-icon filter-bath']/text())[2]").extract()
            if len(bathroom) > 0:
                bathroom = bathroom[0].strip().encode('ascii', 'ignore').decode('ascii')
                item['bathroom'] = bathroom.strip()


            #Default value for parking
            item['parking'] = '0'
            parking = response.xpath("(//div[@class='bbc-icon filter-car']/text())[2]").extract()
            if len(parking) > 0:
                parking = parking[0].strip().encode('ascii', 'ignore').decode('ascii')
                item['parking'] = parking.strip()


            #Default value for photo
            item['photo'] = []
            # Aggregation of multiple values from multiple xpaths
            photo = []

            _photo = response.xpath("//div[@class='carousel-inner']//img/@src").extract()
            for _pd in _photo:
                if _pd not in photo:
                    photo.append(_pd)

            if len(photo) > 0:
                item['photo'] = photo


            #Default value for land_area
            item['land_area'] = '0'


            #Default value for building_area
            item['building_area'] = '0'


            #Default value for inspection_date
            #item['inspection_date'] = ''
            #inspection_date = response.xpath("//div[@class='font13 colorthreefontcolor left valueInsp']/text()").extract()
            #if len(inspection_date) > 0:
            #   inspection_date = inspection_date[0].strip().encode('ascii','ignore')
            #   item['inspection_date'] = inspection_date.strip()

            #self.parse_agent_info_url(self, response)

            #agent_info_url
            #agent_info_url = response.xpath("//div[@id='property_staffMember']/div[@class='left staffImageContainer']/a/@href").extract()
            #if len(agent_info_url) > 0:
            #   agent_info_url = agent_info_url[0].strip()
            #   agent_info_url = urlparse.urljoin(site, agent_info_url)

            # the request must be yielded or Scrapy never schedules it;
            # parse_agent_info_url completes the item and yields it
            yield scrapy.Request(site, callback=self.parse_agent_info_url, dont_filter=True, meta={'item': item})
            #else:
            #   # Default Value for items in the above branch url
            #   #Default value for agent_name
            #   item['agent_name'] = self.name.split('_')[0]
            #
                #Default value for agency_name
            #   item['agency_name'] = ''

                #Default value for agent_proprietor_name
            #   item['agent_proprietor_name'] = self.name.split('_')[0]

                #Default value for agent_licencee_number
            #   item['agent_licencee_number'] = self.name.split('_')[0]

                #Default value for agent_licencee_name
            #   item['agent_licencee_name'] = self.name.split('_')[0]

                #Default value for agency_logo
            #   item['agency_logo'] = ''

                #Default value for agency_email
            #   item['agency_email'] = ''

                #Default value for agent_email
            #   item['agent_email'] = 'ihomeseek@outlook.com'

                #Default value for agent_mobile
            #   item['agent_mobile'] = ''

                #Default value for agency_phone
            #   item['agency_phone'] = ''

                #Default value for agent_fax
            #   item['agent_fax'] = ''

                #Default value for agent_color
            #   item['agent_color'] = ''


            # the completed item is yielded from parse_agent_info_url, so it is not yielded here
        except Exception as e:
            err = "Error in parse_property_urls: %s, Message: %s, Traceback: %s " % (response.url, str(e), sys.exc_info())
            self.log.error(err)
            self.lib.PrintException('mcewingpartners')
            #self.lib.send_email(err, self.name)

    def parse_agent_info_url(self, response):
        try:
            site = response.url
            domain = "mcewingpartners.com"
            # mirror the scheme check used in the other callbacks; unconditionally
            # prefixing 'http://' would corrupt already-absolute URLs
            if urlparse.urlparse(site).scheme == '':
                if site.startswith('//') or site.startswith('\\'):
                    site = 'http:' + site
                else:
                    site = 'http://' + site

            item = response.meta['item']
            #Default value for agent_name
            item['agent_name'] = self.name.split('_')[0]
            agent_name = response.xpath("//a[@class='agent-logo-inner']//img/@src").extract()
            if len(agent_name) > 0:
                agent_name = agent_name[0].strip().encode('ascii', 'ignore').decode('ascii')
                item['agent_name'] = agent_name.strip()


            #Static Value
            item['agency_name'] = 'McEwing Partners'


            #Static Value
            item['agent_proprietor_name'] = 'McEwing Partners'


            #Static Value
            item['agent_licencee_number'] = 'xxxxxxx'


            #Static Value
            item['agent_licencee_name'] = 'xxxxxxx'


            #Default value for agency_logo
            item['agency_logo'] = ''
            agency_logo = response.xpath("//ul[@class='images']/li[@class='images']/img/@src").extract()
            if len(agency_logo) > 0:
                agency_logo = agency_logo[0].strip().encode('ascii', 'ignore').decode('ascii')
                item['agency_logo'] = agency_logo.strip()


            #Default value for agency_email
            item['agency_email'] = ''
            agency_email = response.xpath("//*[@id='staffMember']/div[2]/span/a/text()").extract()
            if len(agency_email) > 0:
                agency_email = agency_email[0].strip().encode('ascii', 'ignore').decode('ascii')
                item['agency_email'] = agency_email.strip()


            #Default value for agent_email
            item['agent_email'] = '** no mail **'
            agent_email = response.xpath("//*[@id='staffMember']/div[2]/span/a/text()").extract()
            if len(agent_email) > 0:
                agent_email = agent_email[0].strip().encode('ascii', 'ignore').decode('ascii')
                item['agent_email'] = agent_email.strip()


            #Default value for agent_mobile
            item['agent_mobile'] = ''
            agent_mobile = response.xpath("//a[@class='property-staff-link']//span/text()").extract()
            if len(agent_mobile) > 0:
                agent_mobile = agent_mobile[0].strip().encode('ascii', 'ignore').decode('ascii')
                item['agent_mobile'] = agent_mobile.strip()


            #Static Value
            item['agency_phone'] = '03 5975 4555'


            #Static Value
            item['agent_fax'] = '03 5975 6444'


            #Static Value
            item['agent_color'] = '#3F3F3F'


            yield item
        except Exception as e:
            err = "Error in parse_agent_info_url: %s, Message: %s, Traceback: %s " % (response.url, str(e), sys.exc_info())
            self.log.error(err)
            self.lib.PrintException('mcewingpartners')
            #self.lib.send_email(err, self.name)


SPIDER = McewingpartnersSpider()
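The Library helper imported from ihs.library is likewise part of the project, not an installable package. From the call sites above (self.lib.PrintException('mcewingpartners') and the commented-out self.lib.send_email(...)), a minimal stand-in could look like the following; this is purely a sketch, as the real implementation is unknown:

# Hypothetical minimal ihs/library.py; only the two methods the spider
# references are sketched here.
import traceback

class Library:
    def PrintException(self, spider_name):
        # print the active exception's traceback, tagged with the spider name
        print("[%s] %s" % (spider_name, traceback.format_exc()))

    def send_email(self, message, spider_name):
        # the real project presumably mails alerts (it imports smtplib); stubbed out here
        pass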

0 Answers:

No answers yet.