Question

我正在尝试使用 Python 的 `or` 运算符，

为每个项目（item）设置与之对应的正确位置（location）。

# Read the search term and the two locations to search for from the user.
search_item = input("Input The Search Item: ")
location = input("Location:")
second_location = input("Second_Location:")

# One search URL per location: the first uses `location`, the second uses
# `second_location`.
start_urls = [
        "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + location,
        "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + second_location
        # "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + third_location,
        # "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + fourth_location
    ]

# NOTE: `a or b` evaluates to `a` whenever `a` is truthy, so this stores
# `second_location` whenever it is a non-empty string and falls back to
# `location` only when `second_location` is empty. The expression does not
# depend on which of the start URLs the item came from.
item['locations'] = second_location or location

我不确定这样写是否正确。

结果是 second_location 的输入值覆盖了所有项目的位置，这不是我想要的。

期望的行为是：从每个 start_url 抓取到的项目，都应当带有与该 URL 对应的 location 或 second_location。

完整代码

# Prompt for the search term and the two locations. These module-level values
# are read by the spider class below when it builds its search URLs.
search_item = input("Input The Search Item: ")
location = input("Location:")
second_location = input("Second_Location:")



# city = [
#     "Los Angeles", "Chicago", "Houston", "Phoenix", "Philadelphia", "San Antonio", "Fort Worth", 
#     "San Diego", "Dallas", "San Jose", "Austin", "Columbus", "Indianapolis",  "Seattle", "St. Paul", "Nashville", 
#     "Louisville", "Plano"
# ]

# rancity = random.choice(city)


class YellowSpider(scrapy.Spider):
    """Scrape business profiles from yellowpages.com, one search per location.

    Every search is started with the location it was built from attached to
    the request ``meta``. That value is forwarded to each follow-up request
    (pagination pages and business profile pages), so every scraped item
    records the location of the search that actually led to it, instead of
    one global value shared by all items.
    """

    name = "yellow"

    # Locations to search, in order. Empty answers are skipped so that a blank
    # prompt does not produce a search with no location. To search more
    # locations, add them to this tuple.
    search_locations = [loc for loc in (location, second_location) if loc]

    # One search URL per entry of ``search_locations`` (same order).
    start_urls = [
        "https://www.yellowpages.com/search?search_terms=" + search_item
        + "&geo_location_terms=" + loc
        for loc in search_locations
    ]

    def __init__(self, *args, **kwargs):
        """Initialise the spider and its per-run "already seen" registries.

        Positional and keyword arguments are passed through to
        ``scrapy.Spider`` so spider arguments keep working.
        """
        super().__init__(*args, **kwargs)
        self.seen_business_names = []
        self.seen_phonenumbers = []
        self.seen_websites = []
        self.seen_emails = []

    def start_requests(self):
        """Yield one search request per location, tagged with that location.

        The pairing relies on ``start_urls`` and ``search_locations`` having
        the same order and length, which holds for the class attributes above.
        """
        for url, loc in zip(self.start_urls, self.search_locations):
            # dont_filter=True so start URLs are never dropped as duplicates.
            yield scrapy.Request(
                url,
                callback=self.parse,
                meta={"location": loc},
                dont_filter=True,
            )

    def parse(self, response):
        """Follow business profile links and pagination links of a result page.

        The location carried by ``response.meta`` is forwarded unchanged to
        every request spawned from this page.
        """
        meta = {"location": response.meta.get("location")}

        for href in response.css('div.v-card a.business-name::attr(href)'):
            yield response.follow(href, self.businessprofile, meta=meta)

        for href in response.css('div.pagination a::attr(href)'):
            yield response.follow(href, self.parse, meta=meta)

    def businessprofile(self, response):
        """Build an item from a business profile page.

        An item is yielded at most once, and only when it has a phone number,
        a website and at least one e-mail address that has not been seen
        before during this run.
        """
        # Location of the search that led to this profile page.
        found_location = response.meta.get("location")

        for business in response.css('header#main-header'):
            item = Item()
            item['business_name'] = business.css('div.sales-info h1::text').extract()
            # Store the URL itself (or None when absent) rather than the
            # string representation of a list.
            item['website'] = business.css(
                'a.secondary-btn.website-link::attr(href)'
            ).extract_first()
            item['locations'] = found_location

            mailto_links = business.css('a.email-business::attr(href)').extract()
            # Drop the first 7 characters (the length of "mailto:"). The loop
            # variable is deliberately not called ``item``, so it cannot
            # shadow the item being built.
            item['email'] = [link[7:] for link in mailto_links]

            item['phonenumber'] = business.css('p.phone::text').extract_first()

            if not (item['phonenumber'] and item['website']):
                continue

            new_emails = [
                address for address in item['email']
                if address not in self.seen_emails
            ]
            if not new_emails:
                continue

            for address in new_emails:
                # The same address may appear twice on one page.
                if address not in self.seen_emails:
                    self.seen_emails.append(address)

            # Yield once per business, not once per new e-mail address.
            yield item

Answer 1

我运行了你那段简短的代码；如果我没有理解错你的意思，它在我这里运行正常。我的 start_urls 变量最终为：
['https://www.yellowpages.com/search?search_terms=my_search_term&geo_location_terms=my_location1', 'https://www.yellowpages.com/search?search_terms=my_search_term&geo_location_terms=my_location2']

我在 Python 2.7 和 Python 3.5 中都试过，都能正常工作。是我误解了你的问题吗？

Python 的 or 运算符没有给出正确的值

1 个答案: