试图抓取这个网站。
https://www.foodpanda.sg/restaurants/new?lat=1.2915902&lng=103.8379066&vertical=restaurants
餐厅详情。
我需要向下滚动才能找到更多餐厅。加载新元素时如何避免重复?截至目前,它加载了新元素,但在 csv 中大多数都是重复的。我试过类似这样的东西,它在其他网站上也能用……但在这个网站上没有用。
# Scroll until no new anchors lazy-load, then collect them all.
current_len = len(likes_div.find_elements_by_xpath('//div[@class="q9uorilb"]//a'))
while True:
    # Send END to an anchor to force the page to scroll and load more rows.
    likes_div.find_element_by_xpath('.//div[@class="q9uorilb"]//a').send_keys(Keys.END)
    try:
        # Wait up to 5 s for the anchor count to grow past what we already saw.
        WebDriverWait(driver, 5).until(
            lambda x: len(driver.find_elements_by_xpath('.//div[@class="q9uorilb"]//a')) > current_len)
        current_len = len(driver.find_elements_by_xpath('.//div[@class="q9uorilb"]//a'))
    except TimeoutException:
        # No growth within the timeout: assume everything is loaded.
        name_eles = list(driver.find_elements_by_xpath('.//div[@class="q9uorilb"]//a'))
        break  # BUG FIX: without this break the while-loop never terminates
这是我上面网页的代码。
def get_rest(wait_seconds=15):
    """Return the restaurant <li> card elements currently in the vendor list.

    wait_seconds: fixed pause (seconds) to let lazily-loaded cards render
    before the lookup. Generalized from the hard-coded 15; the default
    preserves the original behavior.
    """
    time.sleep(wait_seconds)
    # Cards are the <li> items carrying a data-testid and no class attribute.
    restaurant_locator = '//ul[@class="vendor-list"]//li[@data-testid and not(@class)]'
    return driver.find_elements_by_xpath(restaurant_locator)
def get_data(rests):
    """Scrape each restaurant card in *rests* and append one CSV row per card.

    rests: iterable of Selenium WebElements (restaurant <li> cards).
    Writes to Food_Panda_test.csv; a header row is emitted once, tracked by
    the module-level flag ``header_added``. Missing fields fall back to the
    same human-readable placeholder strings as before.
    """
    global header_added
    # Robustness: initialise the flag if the caller never defined it.
    if 'header_added' not in globals():
        header_added = False

    def _text(card, xpath, default):
        # .text of the first match under card, or default when the node is
        # absent. Narrowed from a bare except: (which also hid SystemExit etc.).
        try:
            return card.find_element_by_xpath(xpath).text
        except Exception:
            return default

    for rest in rests:
        name = _text(rest, './/span[@class="name fn"]', 'No name')
        print(name)
        print('*********')
        try:
            link = rest.find_element_by_xpath('.//a').get_attribute('href')
        except Exception:
            link = 'No link available'
        print(link)
        print('**********')
        rating = _text(rest, './/span[@class="rating"]', None)
        # Drop the last two characters (presumably a "/5" suffix — confirm).
        rating = rating[:-2] if rating is not None else 'No Ratings Available'
        print(rating)
        print('*********')
        cuisine = _text(rest, './/ul[@class="categories summary"]', None)
        # Drop the first four characters (presumably an icon prefix — confirm).
        cuisine = cuisine[4:] if cuisine is not None else 'Cuisine Details Not Available'
        print(cuisine)
        print('***********')
        distance = _text(rest, './/span[@class="badge-info"]', "No Distance available")
        print(distance)
        print('***********')
        tags = _text(rest, './/div[@class="tag-container"]', "No special Offers")
        print(tags)
        print('************')
        cashback = _text(rest, './/span[@class="vendor-cashback-info"]', "No Cashback available")
        print(cashback)
        # BUG FIX: link was scraped but never written; added as "URL"
        # (matches the column set used later in this file).
        dict1 = {'Restaurant Name': name, "URL": link, "Rating": rating,
                 "Cuisine": cuisine, "Delivery Time": distance,
                 "Tags": tags, "Cashback": cashback}
        # newline='' prevents blank rows on Windows (csv module requirement).
        with open('Food_Panda_test.csv', 'a+', newline='', encoding='utf-8-sig') as f:
            w = csv.DictWriter(f, dict1.keys())
            if not header_added:
                w.writeheader()
                header_added = True
            w.writerow(dict1)
# Main driver loop: repeatedly fetch the current batch of restaurant cards
# and write their details out.
# NOTE(review): runs forever — there is no termination condition, and each
# pass re-reads every card currently on the page (source of the duplicates).
while True:
    batch = get_rest()
    get_data(batch)
答案 0(得分:0)
我认为一旦您获得餐厅详细信息,就应该删除重复项:
假设您有一个包含重复项的列表 restaurants
。
像这样删除重复条目:
# Deduplicate while preserving order: keep only the first occurrence of each
# element. (BUG FIX: the original seeded `seen` with the whole list and had
# no loop, so `item` was unbound and nothing was ever filtered.)
seen = set()
unique_restaurants = []
for item in restaurants:
    if item not in seen:
        seen.add(item)
        unique_restaurants.append(item)
restaurants = unique_restaurants
如果您有多个列表,则创建一个通用函数并传递列表以完成工作。
更新 1:
执行此操作时 p = get_rest()
则 p 是一个列表。
这样做:
# Same order-preserving dedup applied to p. (BUG FIX: the original seeded
# `seen` with all of p and had no loop, so `item` was unbound and no
# duplicate was ever removed.)
seen = set()
unique_items = []
for item in p:
    if item not in seen:
        seen.add(item)
        unique_items.append(item)
p = unique_items
现在所有重复项都将被删除。
然后做
get_data(p)
答案 1(得分:0)
我能够使用@cruisepandey 给出的想法解决这个问题。谢谢
# Scrape loop: scroll, collect newly loaded restaurant cards, write each one
# to the CSV, and remember what was already written so nothing is duplicated.
old_rest = set()  # cards already scraped in earlier passes

while True:
    driver.execute_script("window.scrollBy(0,3825)", "")
    time.sleep(15)  # give the lazy loader time to render the next batch
    restaurant_locator = '//ul[@class="vendor-list"]//li[@data-testid and not(@class)]'
    restaurants = driver.find_elements_by_xpath(restaurant_locator)
    # List comprehension (instead of the original set difference) keeps the
    # new cards in on-page order; set membership keeps the filter O(1).
    new_cards = [card for card in restaurants if card not in old_rest]
    if not new_cards:
        # BUG FIX: nothing new appeared after scrolling and waiting — assume
        # the list is exhausted and stop (the original looped forever).
        break
    for rest in new_cards:
        driver.execute_script("arguments[0].scrollIntoView();", rest)
        try:
            name = rest.find_element_by_xpath('.//span[@class="name fn"]').text
        except Exception:  # narrowed from bare except throughout
            name = 'No name'
        print(name)
        print('*********')
        try:
            link = rest.find_element_by_xpath('.//a').get_attribute('href')
        except Exception:
            link = 'No link available'
        print(link)
        print('**********')
        try:
            rating = rest.find_element_by_xpath('.//span[@class="rating"]').text
            rating = rating[:-2]  # drop last 2 chars (presumably "/5" — confirm)
        except Exception:
            rating = 'No Ratings Available'
        print(rating)
        print('*********')
        try:
            cuisine = rest.find_element_by_xpath('.//ul[@class="categories summary"]').text
            cuisine = cuisine[4:]  # drop first 4 chars (presumably a prefix — confirm)
        except Exception:
            cuisine = 'Cuisine Details Not Available'
        print(cuisine)
        print('***********')
        try:
            distance = rest.find_element_by_xpath('.//span[@class="badge-info"]').text
        except Exception:
            distance = "No Distance available"
        print(distance)
        print('***********')
        try:
            tags = rest.find_element_by_xpath('.//div[@class="tag-container"]').text
        except Exception:
            tags = "No special Offers"
        print(tags)
        print('************')
        try:
            cashback = rest.find_element_by_xpath('.//span[@class="vendor-cashback-info"]').text
        except Exception:
            cashback = "No Cashback available"
        print(cashback)
        dict1 = {'Restaurant Name': name, "URL": link, "Rating": rating,
                 "Cuisine": cuisine, "Delivery Time": distance,
                 "Tags": tags, "Cashback": cashback}
        # newline='' prevents blank rows on Windows (csv module requirement).
        with open('Food_Panda_test.csv', 'a+', newline='', encoding='utf-8-sig') as f:
            w = csv.DictWriter(f, dict1.keys())
            if not header_added:
                w.writeheader()
                header_added = True
            w.writerow(dict1)
    old_rest |= set(restaurants)  # remember everything scraped so far