我正在开发一个网络爬虫,该爬虫将从网站上下载PDF文件。
我之前检查了网站的源代码,发现下载PDF的按钮实际上是对表单的提交输入。相应地,该表格将检索Content-Disposition标头中的文件。
这是一张照片:
我的问题是,如何使用Web请求(或HTML Agility包)获取此文件。我以这种方式尝试过,但听者返回null。
HttpWebResponse response = (HttpWebResponse)req.GetResponse();
string file = response.Headers["Content-Disposition"];
预先感谢
答案 0 :(得分:0)
我已经有了答案,这是我为获取文件所做的事情
response = (HttpWebResponse)request.GetResponse();
stream = response.GetResponseStream();
byte[] retorno = ReadToEnd(stream);
response.Close();
stream.Close();
public static byte[] ReadToEnd(System.IO.Stream stream)
{
long originalPosition = 0;
if (stream.CanSeek)
{
originalPosition = stream.Position;
stream.Position = 0;
}
try
{
byte[] readBuffer = new byte[4096];
int totalBytesRead = 0;
int bytesRead;
while ((bytesRead = stream.Read(readBuffer, totalBytesRead, readBuffer.Length - totalBytesRead)) > 0)
{
totalBytesRead += bytesRead;
if (totalBytesRead == readBuffer.Length)
{
int nextByte = stream.ReadByte();
if (nextByte != -1)
{
byte[] temp = new byte[readBuffer.Length * 2];
Buffer.BlockCopy(readBuffer, 0, temp, 0, readBuffer.Length);
Buffer.SetByte(temp, totalBytesRead, (byte)nextByte);
readBuffer = temp;
totalBytesRead++;
}
}
}
byte[] buffer = readBuffer;
if (readBuffer.Length != totalBytesRead)
{
buffer = new byte[totalBytesRead];
Buffer.BlockCopy(readBuffer, 0, buffer, 0, totalBytesRead);
}
return buffer;
}
finally
{
if (stream.CanSeek)
{
stream.Position = originalPosition;
}
}
}
谢谢