我从所有类别的site中提取链接。提取链接和处理分页的代码如下:
def parse_data(self,browser):
b = open('link_list.csv', 'a')
a = csv.writer(b, lineterminator='\n')
try:
vacations = browser.find_elements_by_class_name('property_title')
for v in vacations:
vacation_link = v.find_element_by_tag_name('a').get_attribute('href')
a.writerow([vacation_link])
self.pagination(browser)
except:
pass
def pagination(self,browser):
try:
span = browser.find_element_by_css_selector('.paging.pageDisplay')
spantext = browser.find_element_by_css_selector('.paging.pageDisplay').text
#browser.execute_script('$('+str(span)+').next()')
span.find_element_by_xpath('./following-sibling::a[1]')
span.find_element_by_xpath('./following-sibling::a[1]').click()
self.parse_data(browser)
except:
pass
现在,当我运行代码时,一些链接合并为:
http://www.tripadvisor.ca/Attraction_Review-g811253-d1633107-Reviews-Tauro_Tours-Santa_Barbara_de_Samana_Samana_Province_Dominican_http://www.tripadvisor.ca/Attraction_Review-g147290-d2512805-Reviews-Wise_Mountain_Retreat-Puerto_Plata_Puerto_Plata_Province_Dominican_Republic.html`
http://www.tripadvisor.ca/Attraction_Review-g259440-d7263743-Reviewshttp://www.tripadvisor.ca/Attraction_Review-g147288-d2292268-Reviews-Bahia_de_las_Aguilas-Dominican_Republic.html
部分链接已损坏为:
Paraiso-Cabrera_Maria_Trinidad_Sanchez_Province_Dominican_Republic.html