下面是页面源码中的 script 标签,我想用 Scrapy 从中提取 `mp4:` 列表里的字符串。我无法把它直接交给 JSON 解析器加载,也找不到其他办法来实现,而且无法确定它的 XPath。
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>RikTak Video Player - Version 1</title>
<script src="https://cdn.radiantmediatechs.com/rmp/5.2.1/js/rmp.min.js"></script>
<style>
body {
margin: 0;
}
</style>
</head>
<body>
<div id="rmpPlayer"></div>
<script>
// Media sources for the player; `mp4` lists the MP4 stream URL(s).
var bitrates = {
mp4: ['https://mvd8.ddns.me:443/viewm/52/653/52653.mp4?wmsAuthSign=c2VydmVyX3RpbWU9MTAvMjMvMjAxOSA2OjI2OjAzIFBNJmhhc2hfdmFsdWU9ODlyM3FWTlRONldQWGJOT3JWQWJTUT09JnZhbGlkbWludXRlcz02MA==']
};
// Ad schedule: ad request URLs grouped under preroll / midroll / postroll.
var schedule = {
preroll: [
'https://googleads.g.doubleclick.net/pagead/ads?ad_type=video_text_image&client=ca-video-pub-1231661633440980&description_url=https%3A%2F%2Fwww.farfeshplus.com&channel=7962520214&videoad_start_delay=0&hl=ar'
],
// Each midroll entry is a [number, ad URL] pair.
// NOTE(review): the number is presumably a playback offset in seconds — confirm against the player docs.
midroll: [
[600,'https://googleads.g.doubleclick.net/pagead/ads?ad_type=video_text_image&client=ca-video-pub-1231661633440980&description_url=https%3A%2F%2Fwww.farfeshplus.com&channel=7962520214&videoad_start_delay=0&hl=ar'],
[1200,'https://pubads.g.doubleclick.net/gampad/ads?iu=/60345044/Pirsom_Ayoub_LTD_TOP/farfeshplus/farfeshplus_Preroll&description_url=https%3A%2F%2Fwww.farfeshplus.com%2F&env=vp&impl=s&correlator=&tfcd=0&npa=0&gdfp_req=1&output=vast&sz=640x480&unviewed_position_start=1'],
[1800,'https://googleads.g.doubleclick.net/pagead/ads?ad_type=video_text_image&client=ca-video-pub-1231661633440980&description_url=https%3A%2F%2Fwww.farfeshplus.com&channel=7962520214&videoad_start_delay=0&hl=ar']
],
postroll: [
'https://pubads.g.doubleclick.net/gampad/ads?iu=/60345044/Pirsom_Ayoub_LTD_TOP/farfeshplus/farfeshplus_Preroll&description_url=https%3A%2F%2Fwww.farfeshplus.com%2F&env=vp&impl=s&correlator=&tfcd=0&npa=0&gdfp_req=1&output=vast&sz=640x480&unviewed_position_start=1'
]
};
// Player configuration passed to rmp.init() below; it references the
// `bitrates` and `schedule` objects defined above.
var settings = {
licenseKey: 'Kl8lNHNrNzkyY3M5dj9yb201ZGFzaXMzMGRiMEElXyo=',
bitrates: bitrates,
delayToFade: 3000,
width: 750,
height: 440,
skin: 's4',
poster: 'https://images.farfeshplus.com/videos/lrg/laila_m_29.jpg',
ads: true,
adSchedule: schedule
};
// ID of the <div> that hosts the player.
var elementID = 'rmpPlayer';
// NOTE(review): RadiantMP is presumably provided by the rmp.min.js script loaded in <head> — confirm.
var rmp = new RadiantMP(elementID);
rmp.init(settings);
</script>
</body>
</html>
请指点一些提取这些数据的方法。
答案 0 :(得分:2)
首先,您应该选择正确的选择器(selector),把 script 标签的内容提取为文本。
# Take the text of the <script> element(s) that are direct children of <body>;
# .get() yields the first matching text node.
# NOTE(review): `url` is presumably the Scrapy response (or a Selector built from it) — confirm.
text = url.xpath('//body/script/text()').get()
然后,您可以使用正则表达式查找所需的内容。
import re

# Capture the first single-quoted entry that follows `mp4: [` in the script text.
# `[^']*` stops at that entry's closing quote, so when the array holds several
# URLs on one line (mp4: ['a.mp4', 'b.mp4']) only the first URL is captured,
# rather than everything up to the last `']` as a greedy `.*` would.
mp4 = re.compile(r"(?<=mp4:\s\[')([^']*)'")
# findall() returns the captured group of each match; print the first one.
print(mp4.findall(text)[0])
https://mvd8.ddns.me:443/viewm/88/686/88686.mp4?wmsAuthSign=c2VydmVyX3RpbWU9MTAvMjMvMjAxOSAzOjMwOjE3IFBNJmhhc2hfdmFsdWU9UXgrZ1dHTWxhVGdNM0Iyd3dSeHJBdz09JnZhbGlkbWludXRlcz02MA==
# Sample input: the page's inline <script> block held as a string, used as the
# `text` that the mp4 regex is run against.
text = """
<script>
var bitrates = {
mp4: ['https://mvd8.ddns.me:443/viewm/88/686/88686.mp4?wmsAuthSign=c2VydmVyX3RpbWU9MTAvMjMvMjAxOSAzOjMwOjE3IFBNJmhhc2hfdmFsdWU9UXgrZ1dHTWxhVGdNM0Iyd3dSeHJBdz09JnZhbGlkbWludXRlcz02MA==']
};
var schedule = {
preroll: [
'https://googleads.g.doubleclick.net/pagead/ads?ad_type=video_text_image&client=ca-video-pub-1231661633440980&description_url=https%3A%2F%2Fwww.farfeshplus.com&channel=7962520214&videoad_start_delay=0&hl=ar'
],
midroll: [
[600,'https://googleads.g.doubleclick.net/pagead/ads?ad_type=video_text_image&client=ca-video-pub-1231661633440980&description_url=https%3A%2F%2Fwww.farfeshplus.com&channel=7962520214&videoad_start_delay=0&hl=ar'],
[1200,'https://pubads.g.doubleclick.net/gampad/ads?iu=/60345044/Pirsom_Ayoub_LTD_TOP/farfeshplus/farfeshplus_Preroll&description_url=https%3A%2F%2Fwww.farfeshplus.com%2F&env=vp&impl=s&correlator=&tfcd=0&npa=0&gdfp_req=1&output=vast&sz=640x480&unviewed_position_start=1'],
[1800,'https://googleads.g.doubleclick.net/pagead/ads?ad_type=video_text_image&client=ca-video-pub-1231661633440980&description_url=https%3A%2F%2Fwww.farfeshplus.com&channel=7962520214&videoad_start_delay=0&hl=ar']
],
postroll: [
'https://pubads.g.doubleclick.net/gampad/ads?iu=/60345044/Pirsom_Ayoub_LTD_TOP/farfeshplus/farfeshplus_Preroll&description_url=https%3A%2F%2Fwww.farfeshplus.com%2F&env=vp&impl=s&correlator=&tfcd=0&npa=0&gdfp_req=1&output=vast&sz=640x480&unviewed_position_start=1'
]
};
var settings = {
licenseKey: 'Kl8lNHNrNzkyY3M5dj9yb201ZGFzaXMzMGRiMEElXyo=',
bitrates: bitrates,
delayToFade: 3000,
width: 750,
height: 440,
skin: 's4',
poster: 'https://images.farfeshplus.com/videos/lrg/laila_m_29.jpg',
ads: true,
adSchedule: schedule
};
var elementID = 'rmpPlayer';
var rmp = new RadiantMP(elementID);
rmp.init(settings);
</script>
"""
答案 1 :(得分:2)
另一种选择是将 BeautifulSoup 与正则表达式(regex)结合使用。正则表达式部分与 @FlorianBernard 建议的相同。
from bs4 import BeautifulSoup
import re

# Parse the whole HTML document held in `text`.
soup = BeautifulSoup(text, "html.parser")
# Capture the first single-quoted entry that follows `mp4: [`. `[^']*` stops at
# the entry's closing quote, so extra URLs on the same line are not swallowed
# the way a greedy `.*` would.
mp4 = re.compile(r"(?<=mp4:\s\[')([^']*)'")
# Search every <script> tag instead of hard-coding index 1, so the lookup keeps
# working if the page adds, removes or reorders script tags.
urls = [
    url
    for script in soup.find_all('script')
    for url in mp4.findall(script.get_text())
]
print(urls[0])
输出:
https://mvd8.ddns.me:443/viewm/52/653/52653.mp4?wmsAuthSign=c2VydmVyX3RpbWU9MTAvMjMvMjAxOSA2OjI2OjAzIFBNJmhhc2hfdmFsdWU9ODlyM3FWTlRONldQWGJOT3JWQWJTUT09JnZhbGlkbWludXRlcz02MA==
其中 text 是包含整个 HTML 文档的变量。
# Sample input: the complete HTML document held as a string; this is the `text`
# handed to BeautifulSoup.
text = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>RikTak Video Player - Version 1</title>
<script src="https://cdn.radiantmediatechs.com/rmp/5.2.1/js/rmp.min.js"></script>
<style>
body {
margin: 0;
}
</style>
</head>
<body>
<div id="rmpPlayer"></div>
<script>
var bitrates = {
mp4: ['https://mvd8.ddns.me:443/viewm/52/653/52653.mp4?wmsAuthSign=c2VydmVyX3RpbWU9MTAvMjMvMjAxOSA2OjI2OjAzIFBNJmhhc2hfdmFsdWU9ODlyM3FWTlRONldQWGJOT3JWQWJTUT09JnZhbGlkbWludXRlcz02MA==']
};
var schedule = {
preroll: [
'https://googleads.g.doubleclick.net/pagead/ads?ad_type=video_text_image&client=ca-video-pub-1231661633440980&description_url=https%3A%2F%2Fwww.farfeshplus.com&channel=7962520214&videoad_start_delay=0&hl=ar'
],
midroll: [
[600,'https://googleads.g.doubleclick.net/pagead/ads?ad_type=video_text_image&client=ca-video-pub-1231661633440980&description_url=https%3A%2F%2Fwww.farfeshplus.com&channel=7962520214&videoad_start_delay=0&hl=ar'],
[1200,'https://pubads.g.doubleclick.net/gampad/ads?iu=/60345044/Pirsom_Ayoub_LTD_TOP/farfeshplus/farfeshplus_Preroll&description_url=https%3A%2F%2Fwww.farfeshplus.com%2F&env=vp&impl=s&correlator=&tfcd=0&npa=0&gdfp_req=1&output=vast&sz=640x480&unviewed_position_start=1'],
[1800,'https://googleads.g.doubleclick.net/pagead/ads?ad_type=video_text_image&client=ca-video-pub-1231661633440980&description_url=https%3A%2F%2Fwww.farfeshplus.com&channel=7962520214&videoad_start_delay=0&hl=ar']
],
postroll: [
'https://pubads.g.doubleclick.net/gampad/ads?iu=/60345044/Pirsom_Ayoub_LTD_TOP/farfeshplus/farfeshplus_Preroll&description_url=https%3A%2F%2Fwww.farfeshplus.com%2F&env=vp&impl=s&correlator=&tfcd=0&npa=0&gdfp_req=1&output=vast&sz=640x480&unviewed_position_start=1'
]
};
var settings = {
licenseKey: 'Kl8lNHNrNzkyY3M5dj9yb201ZGFzaXMzMGRiMEElXyo=',
bitrates: bitrates,
delayToFade: 3000,
width: 750,
height: 440,
skin: 's4',
poster: 'https://images.farfeshplus.com/videos/lrg/laila_m_29.jpg',
ads: true,
adSchedule: schedule
};
var elementID = 'rmpPlayer';
var rmp = new RadiantMP(elementID);
rmp.init(settings);
</script>
</body>
</html>
"""