使用无限滚动抓取页面时避免重复元素

时间:2021-06-09 14:52:32

标签: python selenium web-scraping

试图抓取这个网站。

https://www.foodpanda.sg/restaurants/new?lat=1.2915902&lng=103.8379066&vertical=restaurants

餐厅详情。

我需要向下滚动才能找到更多餐厅。加载新元素时如何避免重复?截至目前,它加载了新元素,但在 csv 中大多数都是重复的。我试过类似这样的东西,它在其他网站上也能用……但在这个网站上没有用。

        # Snapshot how many result anchors are currently rendered.
        current_len = len(likes_div.find_elements_by_xpath('//div[@class="q9uorilb"]//a'))
        while True:
            # Send END to the first anchor to force the page to scroll down
            # and trigger lazy loading of more results.
            likes_div.find_element_by_xpath('.//div[@class="q9uorilb"]//a').send_keys(Keys.END)
            try:
                # Wait (max 5s) until more anchors than before are present.
                WebDriverWait(driver, 5).until(
                    lambda x: len(driver.find_elements_by_xpath('.//div[@class="q9uorilb"]//a')) > current_len)
                current_len = len(driver.find_elements_by_xpath('.//div[@class="q9uorilb"]//a'))
            except TimeoutException:
                # No growth within 5s: assume everything is loaded and collect
                # all anchors.
                # NOTE(review): there is no `break` after this handler, so the
                # `while True` keeps spinning — presumably the snippet is
                # truncated; confirm against the original code.
                name_eles = [name_ele for name_ele in
                             driver.find_elements_by_xpath('.//div[@class="q9uorilb"]//a')]

这是我上面网页的代码。

def get_rest():
    """Wait for the lazy loader, then return the restaurant <li> cards.

    Returns the list of WebElements matching the vendor-list items that
    carry a data-testid attribute and no class.
    """
    # Give the page time to render newly loaded vendors before querying.
    time.sleep(15)
    cards = driver.find_elements_by_xpath(
        '//ul[@class="vendor-list"]//li[@data-testid and not(@class)]')
    return cards



def _safe_text(rest, xpath, default):
    """Return the .text of the first *xpath* match under *rest*, or *default*
    when the element is missing/stale."""
    try:
        return rest.find_element_by_xpath(xpath).text
    except Exception:  # NoSuchElementException, stale element, ...
        return default


def get_data(rests):
    """Scrape name/link/rating/cuisine/distance/tags/cashback from each
    restaurant card in *rests*, print them, and append one row per card to
    Food_Panda_test.csv.

    Uses the module-level flag `header_added` so the CSV header is written
    only once across calls.
    """
    global header_added
    for rest in rests:
        name = _safe_text(rest, './/span[@class="name fn"]', 'No name')
        print(name)
        print('*********')
        try:
            link = rest.find_element_by_xpath('.//a').get_attribute('href')
        except Exception:
            link = 'No link available'
        print(link)
        print('**********')
        rating = _safe_text(rest, './/span[@class="rating"]', None)
        # Drop the trailing "/5"-style suffix only when a rating was found.
        rating = rating[:-2] if rating is not None else 'No Ratings Available'
        print(rating)
        print('*********')
        cuisine = _safe_text(rest, './/ul[@class="categories summary"]', None)
        # Strip the leading label characters only when the element exists.
        cuisine = cuisine[4:] if cuisine is not None else 'Cuisine Details Not Available'
        print(cuisine)
        print('***********')
        distance = _safe_text(rest, './/span[@class="badge-info"]', 'No Distance available')
        print(distance)
        print('***********')
        tags = _safe_text(rest, './/div[@class="tag-container"]', 'No special Offers')
        print(tags)
        print('************')
        cashback = _safe_text(rest, './/span[@class="vendor-cashback-info"]', 'No Cashback available')
        print(cashback)

        # BUG FIX: `link` was scraped and printed but never written; persist
        # it as a "URL" column (matches the accepted follow-up code).
        dict1 = {'Restaurant Name': name, 'URL': link, 'Rating': rating,
                 'Cuisine': cuisine, 'Delivery Time': distance,
                 'Tags': tags, 'Cashback': cashback}
        with open('Food_Panda_test.csv', 'a+', encoding='utf-8-sig') as f:
            w = csv.DictWriter(f, dict1.keys())
            if not header_added:
                w.writeheader()
                header_added = True
            w.writerow(dict1)


# Driver loop: scrape forever, writing each batch to the CSV.
# NOTE(review): get_rest() returns every card currently in the DOM, so each
# pass re-processes previously scraped restaurants — this loop is the source
# of the duplicate rows the question asks about.
while True:
    p = get_rest()
    get_data(p)

2 个答案:

答案 0 :(得分:0)

我认为一旦您获得餐厅详细信息,就应该删除重复项:

假设您有一个包含重复项的列表 restaurants

像这样删除重复条目:

# De-duplicate *restaurants* in place while preserving the original order.
# BUG FIX: the original seeded `seen` with the whole list (so nothing could
# ever be "unseen") and referenced an undefined `item` outside any loop.
seen = set()
unique = []
for item in restaurants:
    if item not in seen:
        seen.add(item)
        unique.append(item)
restaurants[:] = unique

如果您有多个列表,则创建一个通用函数并传递列表以完成工作。

更新 1:

执行此操作时 p = get_rest() 则 p 是一个列表。

这样做:

# De-duplicate the list *p* in place, keeping first occurrences.
# BUG FIX: the original snippet was not valid Python (the `if` block was
# indented with no enclosing loop), seeded `seen` with the whole list, and
# referenced an undefined `item`.
seen = set()
unique = []
for item in p:
    if item not in seen:
        seen.add(item)
        unique.append(item)
p[:] = unique

现在所有重复项都将被删除。

然后做

 get_data(p)

答案 1 :(得分:0)

我能够使用@cruisepandey 给出的想法解决这个问题。谢谢

def _card_text(card, xpath, default):
    """Return the .text of the first *xpath* match under *card*, or *default*
    when the element is missing/stale."""
    try:
        return card.find_element_by_xpath(xpath).text
    except Exception:  # NoSuchElementException, stale element, ...
        return default


# Fixed column order for the CSV rows written below.
_FIELDS = ['Restaurant Name', 'URL', 'Rating', 'Cuisine', 'Delivery Time',
           'Tags', 'Cashback']

header_added = False  # BUG FIX: was read below but never initialised (NameError)
old_rest = set()      # cards already scraped in earlier passes

while True:
    # Scroll one screen down, then give the lazy loader time to append cards.
    driver.execute_script("window.scrollBy(0,3825)", "")
    time.sleep(15)
    restaurant_locator = '//ul[@class="vendor-list"]//li[@data-testid and not(@class)]'
    restaurants = driver.find_elements_by_xpath(restaurant_locator)
    # Selenium WebElements hash/compare by their remote element id, so the
    # set difference keeps only cards not seen in the previous pass.
    ans = set(restaurants) - set(old_rest)

    for rest in ans:
        driver.execute_script("arguments[0].scrollIntoView();", rest)
        name = _card_text(rest, './/span[@class="name fn"]', 'No name')
        print(name)
        print('*********')
        try:
            link = rest.find_element_by_xpath('.//a').get_attribute('href')
        except Exception:
            link = 'No link available'
        print(link)
        print('**********')
        rating = _card_text(rest, './/span[@class="rating"]', None)
        # Drop the trailing "/5"-style suffix only when a rating was found.
        rating = rating[:-2] if rating is not None else 'No Ratings Available'
        print(rating)
        print('*********')
        cuisine = _card_text(rest, './/ul[@class="categories summary"]', None)
        # Strip the leading label characters only when the element exists.
        cuisine = cuisine[4:] if cuisine is not None else 'Cuisine Details Not Available'
        print(cuisine)
        print('***********')
        distance = _card_text(rest, './/span[@class="badge-info"]', 'No Distance available')
        print(distance)
        print('***********')
        tags = _card_text(rest, './/div[@class="tag-container"]', 'No special Offers')
        print(tags)
        print('************')
        cashback = _card_text(rest, './/span[@class="vendor-cashback-info"]', 'No Cashback available')
        print(cashback)

        row = {'Restaurant Name': name, 'URL': link, 'Rating': rating,
               'Cuisine': cuisine, 'Delivery Time': distance,
               'Tags': tags, 'Cashback': cashback}
        with open('Food_Panda_test.csv', 'a+', encoding='utf-8-sig') as f:
            w = csv.DictWriter(f, _FIELDS)
            if not header_added:
                w.writeheader()
                header_added = True
            w.writerow(row)

    # BUG FIX: this assignment sat inside the for-loop; remember the full
    # current card list once per scroll pass so the next set difference works.
    old_rest = restaurants