我需要从https://www.astrotheme.com/celestar/horoscope_celebrity_search_by_filters.php获取名人信息 输入:出生时间仅是已知的时间,职业的世界事件除外,在那里我得到了22822名名人的帮助。我可以使用urllib2和bs4来获取首页数据
import re
import urllib2
from bs4 import BeautifulSoup
url = "https://www.astrotheme.com/celestar/horoscope_celebrity_search_by_filters.php"
data = "sexe=M|F&categorie[0]=0|1|2|3|4|5|6|7|8|9|10|11|12&connue=1&pays=-1&tri=0&x=33&y=13"
fp = urllib2.urlopen(url, data)
soup = BeautifulSoup(fp, 'html.parser')
from_div = soup.find_all('div', attrs={'class': 'titreFiche'})
for major in from_div:
name = re.findall(r'portrait">(.*?)<br/>', str(major))
link = re.findall(r'<a href="(.*?)"', str(major))
print name[0], link[0]
在接下来的230页中,我无法获取数据。我曾经将URL更改为等于页面的URL,直到结束,但无法抓取。有什么办法可以从该页面获取剩余数据?
答案 0 :(得分:1)
您需要会话Cookie,请使用requests
轻松保存会话
from bs4 import BeautifulSoup
import requests, re
url = "https://www.astrotheme.com/celestar/horoscope_celebrity_search_by_filters.php"
searchData = {
"sexe": "M|F",
"categorie[0]": "0|1|2|3|4|5|6|7|8|9|10|11|12",
"connue": 1, "pays": -1, "tri": 0, "x": 33, "y": 13
}
session = requests.session()
def doSearch(url, data=None):
if data:
fp = session.post(url, data=data).text
else:
fp = session.get(url).text
soup = BeautifulSoup(fp, 'html.parser')
from_div = soup.find_all('div', attrs={'class': 'titreFiche'})
for major in from_div:
name = re.findall(r'portrait">(.*?)<br/>', str(major))
link = re.findall(r'<a href="(.*?)"', str(major))
print name[0], link[0]
# do Post search in first request
doSearch(url, searchData)
# we have session and we can use Get request for next page
for index in range(2, 4): # get page 2 to 3
print('getting page: %s' % index)
pageurl = '%s?page=%s' % (url, index)
print(pageurl)
doSearch(pageurl)