I hope you are all doing well, both in health and in your research work.
import re
import webbrowser
from urllib.request import urlopen

import scrapy


class QuotesSpider(scrapy.Spider):
    name = "forum"

    def start_requests(self):
        # further start pages can be added here, e.g.
        # 'https://www.siasat.pk/forum/content.php/', 'http://hamariweb.com/news/',
        # 'https://www.urdupoint.com/pakistan/all-news/'
        urls = ['https://tribune.com.pk/']  # no trailing comma, which would turn this into a tuple
        for url in urls:
            webbrowser.open(url)  # webbrowser.open() expects a URL string, not a response object
            website = urlopen(url)
            print("HELLO WORLD")
            # read() returns bytes; decode before matching with a str pattern
            html = website.read().decode('utf-8', errors='ignore')
            # the pattern has two groups, so findall() yields (url, scheme) tuples; keep the URL
            all_links = [match[0] for match in re.findall('"((http|ftp)s?://.*?)"', html)]
            for link in all_links:
                yield scrapy.Request(url=link, callback=self.parse)

    def parse(self, response):
        page = response.url.split('/')[-2]
        filename = '%s' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
I want to open a webpage that contains many other links, open all of those links, and have Scrapy scrape all of the resulting pages. Please help me. Thanks in advance.
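For comparison, here is a minimal sketch of the same idea done entirely inside Scrapy, letting the framework fetch the start page and a selector collect the links instead of mixing in urlopen and a regex. The spider name, the blanket a::attr(href) selector, and the filename scheme are illustrative assumptions, not tested against these sites:

import scrapy


class LinkFollowSpider(scrapy.Spider):
    name = "linkfollow"  # hypothetical name; any unique spider name works
    start_urls = ['https://tribune.com.pk/']

    def parse(self, response):
        # collect every anchor on the start page and let Scrapy fetch it;
        # response.follow() resolves relative URLs automatically
        for href in response.css('a::attr(href)').getall():
            yield response.follow(href, callback=self.save_page)

    def save_page(self, response):
        # crude filename: last non-empty path segment
        # (assumption: unique enough for a demo)
        filename = response.url.rstrip('/').split('/')[-1] or 'index'
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)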
Answer 0 (score: 0)
I tried this with monsterindia.com, using Scrapy to open a results page that contains multiple links. I scraped all the data from each of those links, and pagination works as well. The following code may be useful.
import scrapy

# BotItem is this project's Item subclass (defined in its items.py, not shown here)


class MonsterSpider(scrapy.Spider):
    name = 'monster'
    start_urls = ['http://jobsearch.monsterindia.com/searchresult.html?day=1&jbc=22']
    item = BotItem()  # requires importing BotItem from the project's items module
    count = 1

    def parse(self, response):
        # follow every job-posting link on the result page
        for href in response.css('h2.seotitle > a::attr(href)'):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url=url, callback=self.parse_details)

        # pagination: the "Next" link keeps its target in custom attributes
        next_page_url = response.css('ul.pager').xpath('//a[contains(text(), "Next")]/@althref').extract_first()
        print(next_page_url)
        if next_page_url:
            nextpage = response.css('ul.pager').xpath('//a[contains(text(), "Next")]/@onclick').extract_first()
            searchresult_num = nextpage.split("'")[1].strip()
            next_page_url = "http://jobsearch.monsterindia.com/searchresult.html?day=1&n=" + searchresult_num
            next_page_url = response.urljoin(next_page_url)
            print(next_page_url)
            yield scrapy.Request(url=next_page_url, callback=self.parse)

    def parse_details(self, response):
        # extract the fields you need from the individual job page here
        pass
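Assuming the spider lives in an ordinary Scrapy project, it can be run from the project directory with the built-in crawl command; the feed-export file name here is arbitrary:

scrapy crawl monster -o items.json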