我正在抓取这个网站:"https://www.indeed.com/jobs?q=data+scientist&start=",想获取其中 div 标签的 class 属性,其值应为 "row result clickcard"。但是用下面的代码抓取时,得到的结果却是 ['','row','result']。为什么会这样?注意:我是按 F12 打开浏览器开发者工具查看后,才确认它是 "row result clickcard" 的。
import math
import urllib
import urllib.request

import bs4
# Fetch Indeed "data scientist" search result pages and print the ``class``
# and ``id`` attributes of every <div> found inside the element whose id is
# 'resultsCol'.
#
# NOTE: Beautiful Soup treats ``class`` as a multi-valued attribute and returns
# it as a list of tokens, so each div prints as a list rather than one string.
# NOTE(review): class names that show up in the browser's dev tools but not
# here are presumably added by client-side JavaScript, which urllib never
# runs -- confirm by comparing against the raw page source.

# Presumably the total number of search results when the script was written.
TOTAL_RESULTS = 29024
# Page size used both for the page count and for the ``start`` query offset.
RESULTS_PER_PAGE = 10

url = 'http://www.indeed.com/jobs?q=data+scientist&l='
# Close the HTTP response deterministically instead of leaking the socket.
with urllib.request.urlopen(url) as response:
    source = response.read()
# Name the parser explicitly so the parse tree does not depend on which
# optional parsers happen to be installed.
bs_tree = bs4.BeautifulSoup(source, 'html.parser')

num_pages = math.ceil(TOTAL_RESULTS / RESULTS_PER_PAGE)
base_url = 'http://www.indeed.com'
job_links = []

for i in range(1):  # do range(num_pages) if you want them all
    if i % 10 == 0:
        # Countdown of remaining pages, printed on every tenth page.
        print(num_pages - i)
    url = ('http://www.indeed.com/jobs?q=data+scientist&start='
           + str(i * RESULTS_PER_PAGE))
    with urllib.request.urlopen(url) as response:
        html_page = response.read()
    bs_tree = bs4.BeautifulSoup(html_page, 'html.parser')
    job_link_area = bs_tree.find(id='resultsCol')
    if job_link_area is None:
        # find() returns None when no element carries that id; skip the page
        # rather than crash with AttributeError on the next line.
        continue
    job_postings = job_link_area.find_all("div")
    for jp in job_postings:
        # Only divs that actually carry a class attribute are reported.
        if jp.get('class') is not None:
            a = jp.get("class")
            b = jp.get('id')
            print(a)
            print(b)