我有一个使用AJAX滚动分页的网站(滚动时会加载更多内容)。默认情况下,它只显示25个项目,我已经能够抓取到这25个项目。
如何从分页内容中抓取数据?
我正在使用BeautifulSoup,并通过cronjob来抓取数据。
我的代码:
# Scrape the listing page at ``url``: for every <section class="jrcl"> follow
# its detail link, read the company name, address and phone numbers from the
# detail page's <section class="jdlc">, and save them on a VendorDetails
# object.
# NOTE(review): the snippet was pasted without indentation; everything from
# the link lookup down to vendor.save() is assumed to run once per listing.
r = requests.get(url)
data = r.text
# Name the parser explicitly so parsing does not depend on which optional
# parser libraries happen to be installed where the cron job runs.
soup = BeautifulSoup(data, "html.parser")
content = soup.find_all('section', {'class': 'jrcl'})
for c in content:
    anchors = c.select('a')
    # Prefer the second anchor of the listing; fall back to the first one
    # when the second is missing, has no href, or cannot be fetched.
    try:
        links = anchors[1]['href']
        web_link = requests.get(links)
    except (IndexError, KeyError, requests.RequestException):
        links = anchors[0]['href']
        web_link = requests.get(links)
    # Single-argument form prints the same text under Python 2 and 3.
    print("web %s" % links)

    content_data = web_link.text
    soup_content = BeautifulSoup(content_data, "html.parser")
    text = soup_content.find('section', {'class': 'jdlc'})

    vendor = VendorDetails()
    vendor.company = text.select('.fn')[0].text
    vendor.source = links
    vendor.address = text.select('.jadlt')[0].text

    # Collect every phone link once. The first number goes to ``contact`` and
    # the second (if any) to ``contact2``; previously both fields were read
    # from index 0, so ``contact2`` merely duplicated ``contact``.
    phones = [
        tel['href'].replace('tel:', ' ')
        for tel in text.select('.tel')
        if tel.get('href')
    ]
    if phones:
        vendor.contact = phones[0]
    if len(phones) > 1:
        vendor.contact2 = phones[1]
    # A page with no phone link leaves both fields at whatever defaults
    # VendorDetails provides instead of aborting the whole run.
    vendor.save()
答案 0 :(得分:2)
我用Selenium和PhantomJS解决了这个问题。我用window.scrollTo反复滚动到页面底部,以加载整个页面的内容,这个方法对我有效。
def handle(self, *args, **options):
    """Open the page in PhantomJS, scroll it repeatedly, then collect items.

    The page is scrolled to the bottom a fixed number of times with a short
    pause after each jump, so that content loaded on scroll has time to
    arrive. Afterwards every element with class ``jcn`` is collected and a
    screenshot is written to ``testing.png``.

    Args:
        *args: Unused positional arguments.
        **options: Unused keyword options.
    """
    driver = webdriver.PhantomJS()
    try:
        driver.get("http://example.com")
        time.sleep(3)  # give the initial page time to render
        driver.set_window_size(1024, 768)

        no_of_pagedowns = 20
        for _ in range(no_of_pagedowns):
            # Jump to the current bottom of the document, then wait for the
            # next batch of items before scrolling again.
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)

        post_elems = driver.find_elements_by_class_name("jcn")
        driver.save_screenshot('testing.png')
        for post in post_elems:
            ###Operations to be done
            pass
    finally:
        # quit() ends the whole driver session even when a step above raised;
        # close() would only close the current window.
        driver.quit()