我试图从Udacity抓取课程名称及其学生数量,以找出哪些课程最受欢迎。我已经为item编写了如下代码:
import scrapy
class UdacityItem(scrapy.Item):
    """Scraped record for one course: its name and its student count."""

    # Course title, filled from the link text on the listing page.
    name = scrapy.Field()
    # Student count, filled from the individual course page.
    users = scrapy.Field()
以及蜘蛛(spider)的代码:
import scrapy
from Udacity.items import UdacityItem
import re
class DmozSpider(scrapy.Spider):
    """Spider that pairs each course name with the count text on its page.

    ``parse`` walks the course listing and schedules one request per course;
    ``second`` reads the text of the ``<strong data-course-student-count>``
    element from the course page and emits the finished item.
    """

    name = "UdSpider"
    allowed_domains = ["udacity.com"]
    start_urls = ["https://www.udacity.com/courses/all"]

    def parse(self, response):
        """Yield one course-page request for every ``//h3/a`` link.

        The item, with only ``name`` set, travels to ``second`` through the
        request's ``meta`` dict under the key ``'item'``.
        """
        for link in response.xpath('//h3/a'):
            course = UdacityItem()
            course['name'] = link.xpath('text()').extract()[0].strip()
            target = response.urljoin(link.xpath('@href').extract()[0])
            yield scrapy.Request(
                target,
                callback=self.second,
                meta={'item': course},
            )

    def second(self, response):
        """Store the first matching count text in ``users`` and yield the item."""
        course = response.meta['item']
        texts = response.xpath(
            '//strong[@data-course-student-count]/text()'
        ).extract()
        course['users'] = texts[0]
        yield course
结果,我得到了课程名称,但得到的不是学生数量,而是占位文本"thousands of"(数千)。当我在浏览器中打开示例页面时,我发现"thousands of"只是初始值,大约1-2秒之后,这段文本才会变成正确的数字(也就是我想要获取的数字)。
以下是我的问题:
提前感谢您的帮助。
答案 0 :(得分:3)
要获取注册人数,您需要针对特定课程ID模拟对 https://www.udacity.com/api/summaries 端点的API请求。该课程ID可以从课程页面的URL本身提取——例如,在网址
https://www.udacity.com/course/javascript-promises--ud898
中,课程ID是 ud898。
完整的蜘蛛代码:
import json
import re

try:
    from urllib import quote_plus  # Python 2
except ImportError:
    from urllib.parse import quote_plus  # Python 3

import scrapy
class UdacityItem(scrapy.Item):
    """Scraped record for one course: its name and its enrollment count."""

    # Course title taken from the listing page.
    name = scrapy.Field()
    # Number of enrolled students for the course.
    users = scrapy.Field()
class DmozSpider(scrapy.Spider):
    """Spider that yields every course name together with its enrollment total.

    Flow: ``parse`` (listing page) -> ``second`` (course page, used only for
    its URL) -> ``parse_totals`` (summaries API response). The item created in
    ``parse`` is forwarded through ``meta`` at every hop so the final callback
    can complete and emit it.
    """

    name = "UdSpider"
    allowed_domains = ["udacity.com"]
    start_urls = ["https://www.udacity.com/courses/all"]

    def parse(self, response):
        """Yield one course-page request for every ``//h3/a`` link.

        The item, with only ``name`` set, is attached to the request's
        ``meta`` under the key ``'item'``.
        """
        for link in response.xpath('//h3/a'):
            item = UdacityItem()
            item['name'] = link.xpath('text()').extract()[0].strip()
            url = response.urljoin(link.xpath('@href').extract()[0])
            yield scrapy.Request(url, callback=self.second,
                                 meta={'item': item})

    def second(self, response):
        """Request the enrollment summary for the course behind ``response``.

        The course key is whatever follows ``--`` in the course URL. Pages
        whose URL carries no such suffix are logged and skipped instead of
        raising ``AttributeError`` on a failed regex match.
        """
        key_match = re.search(r'--(.*?)$', response.url)
        if key_match is None:
            self.logger.warning("No course key in URL: %s", response.url)
            return

        queries = [{
            "limit": 1,
            "model": "CourseStudentsSummary",
            "locator": {
                "sample_frequency": "daily",
                "content_context": [{
                    "node_key": key_match.group(1)
                }]
            }
        }]
        yield scrapy.Request(
            method="GET",
            url="https://www.udacity.com/api/summaries?queries="
                + quote_plus(json.dumps(queries)),
            callback=self.parse_totals,
            # Keep forwarding the item so parse_totals can fill in ``users``.
            meta={'item': response.meta['item']},
        )

    def parse_totals(self, response):
        """Store ``total_enrollments`` in the item's ``users`` field and yield it."""
        item = response.meta['item']
        # The first 5 bytes are dropped before JSON decoding -- presumably a
        # non-JSON guard prefix added by the API; confirm against a live
        # response.
        payload = json.loads(response.body[5:].strip())
        summary = payload["summaries"]["default"][0]
        item['users'] = summary["data"]["total_enrollments"]
        yield item