我正在尝试使用 `or` 运算符,让每个抓取到的条目记录与其 start_url 相对应的地点:
# Excerpt of the spider: the search term and two locations are read
# interactively, and one search URL is built per location.
search_item = input("Input The Search Item: ")
location = input("Location:")
second_location = input("Second_Location:")
start_urls = [
"https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + location,
"https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + second_location
# "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + third_location,
# "https://www.yellowpages.com/search?search_terms=" + search_item + "&geo_location_terms=" + fourth_location
]
# NOTE: `a or b` evaluates to `a` whenever `a` is truthy, so this expression
# yields second_location for every item as long as second_location is a
# non-empty string; it does not depend on which start URL produced the item.
item['locations'] = second_location or location
不确定我的写法是否正确:`second_location` 的输入覆盖了所有条目,这不是我想要的。
它应该这样工作:对于每个 start_url,由它抓取到的条目应该带有与之对应的 `location` 或 `second_location`。
完整代码
# Search parameters are read interactively at module import time, before the
# spider class below is defined; the class body uses these names to build
# its start URLs.
search_item = input("Input The Search Item: ")
location = input("Location:")
second_location = input("Second_Location:")
# Disabled experiment: pick a random city instead of prompting for one.
# city = [
# "Los Angeles", "Chicago", "Houston", "Phoenix", "Philadelphia", "San Antonio", "Fort Worth",
# "San Diego", "Dallas", "San Jose", "Austin", "Columbus", "Indianapolis", "Seattle", "St. Paul", "Nashville",
# "Louisville", "Plano"
# ]
# rancity = random.choice(city)
class YellowSpider(scrapy.Spider):
    """Scrape business contact details from yellowpages.com search results.

    One search is started for each entry in ``search_locations``. The
    location a search was started for travels with every follow-up request
    in ``Request.meta['location']``, so each scraped item records the
    location of the search it came from instead of one global value.
    """

    name = "yellow"

    # Locations to search; append further locations here to add searches.
    search_locations = [location, second_location]

    # One search URL per location, in the same order as ``search_locations``.
    # Only the outermost iterable of a comprehension is evaluated in class
    # scope, which is why the URL prefix is spelled out inline here.
    start_urls = [
        "https://www.yellowpages.com/search?search_terms=" + search_item
        + "&geo_location_terms=" + place
        for place in search_locations
    ]

    def __init__(self, *args, **kwargs):
        """Initialise the base spider and the per-run de-duplication lists."""
        # Forward arguments so the base Spider initialiser still runs and
        # spider arguments keep working.
        super(YellowSpider, self).__init__(*args, **kwargs)
        self.seen_business_names = []
        self.seen_phonenumbers = []
        self.seen_websites = []
        self.seen_emails = []

    def start_requests(self):
        """Yield one search request per location, tagged with that location."""
        for place, url in zip(self.search_locations, self.start_urls):
            yield scrapy.Request(
                url,
                callback=self.parse,
                dont_filter=True,
                meta={'location': place},
            )

    def parse(self, response):
        """Follow business-profile links and pagination links of a result page.

        The originating search location is copied from this response's meta
        onto every request that is spawned from it.
        """
        place = response.meta.get('location')
        for href in response.css('div.v-card a.business-name::attr(href)'):
            yield response.follow(href, self.businessprofile, meta={'location': place})
        for href in response.css('div.pagination a::attr(href)'):
            yield response.follow(href, self.parse, meta={'location': place})

    def businessprofile(self, response):
        """Extract one item per profile header and yield it for each new email.

        An item is yielded only when it has a phone number and a website,
        and only for email addresses that have not been seen before.
        """
        for business in response.css('header#main-header'):
            item = Item()
            item['business_name'] = business.css('div.sales-info h1::text').extract()
            website_links = business.css('a.secondary-btn.website-link::attr(href)').extract()
            # Stored as the list's string form with the surrounding brackets
            # stripped; an empty list therefore becomes an empty string.
            item['website'] = str(website_links).strip('[]')
            # Location of the search that led to this profile page.
            item['locations'] = response.meta.get('location')
            mail_links = business.css('a.email-business::attr(href)').extract()
            # Drop the first 7 characters of each href
            # (presumably the "mailto:" prefix; verify against the page markup).
            item['email'] = [link[7:] for link in mail_links]
            item['phonenumber'] = business.css('p.phone::text').extract_first()

            # Incomplete records are never yielded.
            if not (item['phonenumber'] and item['website']):
                continue

            for address in item['email']:
                if address not in self.seen_emails:
                    self.seen_emails.append(address)
                    yield item
答案 0 :(得分:0)
我运行了你那段简短的代码,如果我没有理解错的话,它在我这里运行正常。我的 `start_urls` 变量最终为:
['https://www.yellowpages.com/search?search_terms=my_search_term&geo_location_terms=my_location1',
'https://www.yellowpages.com/search?search_terms=my_search_term&geo_location_terms=my_location2']
我在 Python 2.7 和 Python 3.5 中都试过,都能正常运行。我是不是误解了你的问题?