我是scrapy和python的新手,我可以下载所有文件,但是我只想下载特定的Type文件“ EX-10”,这样它就可以下载followinh文件。 (Ex-10.1,Ex-10.2至EX-10.99)。
我的代码
import scrapy, os
class legco(scrapy.Spider):
name = "sec_gov"
start_urls = ["https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&SIC=2834&owner=exclude&match=&start=120&count=40&hidefilings=0"]
def parse(self, response):
for link in response.xpath('//table[@summary="Results"]//td[@scope="row"]/a/@href').extract():
absoluteLink = response.urljoin(link)
yield scrapy.Request(url = absoluteLink, callback = self.parse_links)
def parse_links(self, response):
for links in response.xpath('//table[@summary="Results"]//a[@id="documentsbutton"]/@href').extract():
targetLink = response.urljoin(links)
yield scrapy.Request(url = targetLink, callback = self.collecting_file_links)
def collecting_file_links(self, response):
for links in response.xpath('//table[contains(@summary,"Document")]//td[@scope="row"]/a/@href').extract():
if links.endswith(".htm") or links.endswith(".txt"):
baseLink = response.urljoin(links)
yield scrapy.Request(url = baseLink, callback = self.download_files)
def download_files(self, response):
path = response.url.split('/')[-1]
dirf = r"/home/surukam/scrapy/demo/tutorial/tutorial/Downloads3"
if not os.path.exists(dirf):os.makedirs(dirf)
os.chdir(dirf)
with open(path, 'wb') as f:
f.write(response.body)
Scrapy还想检查下一页...(到最后一页),它不能正常工作。
Rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@value="Next 40"]',)), callback="parse", follow= True),)
# follow next page links
next_page = response.xpath('.//a[@value="Next 40"]/@href').extract()
if next_page:
next_href = next_page[0]
next_page_url = 'https://www.sec.gov/cgi-bin/browse-edgar?company=&match=&CIK=&filenum=&State=&Country=&SIC=2834&owner=exclude&Find=Find+Companies&action=getcompany' + next_href
request = scrapy.Request(url=next_page_url)
yield request
答案 0 :(得分:0)
您需要使用lea esi,str_array
input text, 13;input is a predefined macro
atoi text; is a predefined macro returning into ax
mov [esi],ax
,但是scrapy提供的名称会根据URL的哈希值生成文件名。
如果要使用自定义文件名,则必须这样创建自己的FilesPipeline
:
FilesPipeline
答案 1 :(得分:0)
您的问题似乎已解决。以下脚本应该在每个分页链接之后从该站点获取您所需的文件,并按照您希望的方式下载这些文件。
import scrapy, os
class legco(scrapy.Spider):
name = "sec_gov"
start_urls = ["https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&SIC=2834&owner=exclude&match=&start=120&count=40&hidefilings=0"]
def parse(self, response):
for link in response.xpath('//table[@summary="Results"]//td[@scope="row"]/a/@href').extract():
absoluteLink = response.urljoin(link)
yield scrapy.Request(url = absoluteLink, callback = self.parse_links)
nextpage = response.css("input[value='Next 40']::attr(onclick)")
if nextpage:
tpage = nextpage.extract_first().split("parent.location=")[1].replace("'","")
nlink = response.urljoin(tpage)
yield scrapy.Request(url=nlink, callback = self.parse)
def parse_links(self, response):
for links in response.xpath('//table[@summary="Results"]//a[@id="documentsbutton"]/@href').extract():
targetLink = response.urljoin(links)
yield scrapy.Request(url = targetLink, callback = self.collecting_file_links)
def collecting_file_links(self, response):
for links in response.xpath('//table[contains(@summary,"Document")]//tr[td[starts-with(., "EX-")]]/td/a[contains(@href, ".htm") or contains(@href, ".txt")]/@href').extract():
baseLink = response.urljoin(links)
yield scrapy.Request(url = baseLink, callback = self.download_files)
def download_files(self, response):
path = response.url.split('/')[-1]
dirf = r"/home/surukam/scrapy/demo/tutorial/tutorial/Downloads3"
if not os.path.exists(dirf):os.makedirs(dirf)
os.chdir(dirf)
with open(path, 'wb') as f:
f.write(response.body)