I want to be as thorough as possible in this post, because this matters a lot to me, even though the problem itself is quite simple and you can understand it just by reading the title.
The question is: given that healthy bandwidth (30 Mb VDSL) is available...
How can I issue multiple HttpWebRequests for a single piece of data / a single file, so that each request downloads only a part of the data, and then, when all instances have completed, the parts are joined back into one piece?
Code:
What I have working so far is one task = one HttpWebRequest = a different file, so the speedup is pure task parallelism rather than speeding up a single download with multiple tasks/threads, which is what my question is about. See the code below.
The next part is just a more detailed explanation and some background on the subject... if you don't mind reading it.
I am still in the middle of a similar project (which has this problem). It works the way the code below shows: it fetches as many different data sources as possible, one per separate task (a different download/file per task), so the speedup comes from the fact that no task has to wait for the previous one to finish before it gets a chance to execute.
What I am actually trying to do in this question (everything is prepared for it in the code below) is to target the same URL for the same data, so that this time the speedup benefits a single task: the current download.
The idea is to implement the same approach as in the code below, only this time have SmartWebClient target the same URL using multiple instances. Then (only in theory for now) each instance would request just a part of the data content, with multiple requests per instance.
The last issue is that I need to "put the puzzle back into one piece"... another problem I still need to figure out...
As you can see in this code, the only part I have not started working on yet is the data parsing/processing, which I find very easy to do with HtmlAgilityPack, so that is not a problem.
主要条目:
var htmlDictionary = new urlsForExtraction().urlsConcrDict();
Parallel.ForEach(
    urlList.Values,
    new ParallelOptions { MaxDegreeOfParallelism = 20 },
    url => Download(url, htmlDictionary)
);
foreach (var pair in htmlDictionary)
{
    ///Process(pair);
    MessageBox.Show(pair.Value);
}
public class urlsForExtraction
{
    const string URL_Dollar = "";
    const string URL_UpdateUsersTimeOut = "";

    public ConcurrentDictionary<string, string> urlsConcrDict()
    {
        // need to find the syntax to extract the field names, so it would be
        // possible to iterate over each one instead of specifying them by hand
        ConcurrentDictionary<string, string> retDict = new ConcurrentDictionary<string, string>();
        retDict.TryAdd("URL_Dollar", "Any.Url.com");
        retDict.TryAdd("URL_UpdateUserstbl", "http://bing.com");
        return retDict;
    }
}
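The comment inside urlsConcrDict asks how to iterate over the URL fields instead of naming each one. One way is reflection over the class's const string fields; below is a minimal sketch of that idea (the method name is mine, it assumes the URLs stay as const string members of this class, and it needs using System.Reflection):

public ConcurrentDictionary<string, string> UrlsFromConstFields()
{
    var retDict = new ConcurrentDictionary<string, string>();
    foreach (FieldInfo field in typeof(urlsForExtraction)
        .GetFields(BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Static))
    {
        // IsLiteral && !IsInitOnly identifies const fields
        if (field.IsLiteral && !field.IsInitOnly && field.FieldType == typeof(string))
        {
            retDict.TryAdd(field.Name, (string)field.GetRawConstantValue());
        }
    }
    return retDict;
}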
/// <summary>
/// Second stage class: consumes the dictionary of urls for extraction,
/// then downloads each one via Parallel.ForEach using the SmartWebClient (Download()).
/// </summary>
public class InitConcurentHtmDictExtrct
{
    private ConcurrentDictionary<string, string> htmlDictionary;

    private void Download(string url, ConcurrentDictionary<string, string> htmlDictionary)
    {
        using (var webClient = new SmartWebClient())
        {
            webClient.Encoding = Encoding.GetEncoding("UTF-8");
            webClient.Proxy = null;
            htmlDictionary.TryAdd(url, webClient.DownloadString(url));
        }
    }

    public ConcurrentDictionary<string, string> LoopOnUrlsVia_SmartWC(Dictionary<string, string> urlList)
    {
        htmlDictionary = new ConcurrentDictionary<string, string>();
        Parallel.ForEach(
            urlList.Values,
            new ParallelOptions { MaxDegreeOfParallelism = 20 },
            url => Download(url, htmlDictionary)
        );
        return htmlDictionary;
    }
}
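For completeness, here is how the pieces above could be wired together. This is a sketch only and the variable names are mine; note that LoopOnUrlsVia_SmartWC takes a plain Dictionary, so the ConcurrentDictionary is copied into one first:

// hypothetical glue code: build the url dictionary, download everything
// in parallel, then hand the results to the extraction stage
var urls = new urlsForExtraction().urlsConcrDict();
var extractor = new InitConcurentHtmDictExtrct();
ConcurrentDictionary<string, string> htmlResults =
    extractor.LoopOnUrlsVia_SmartWC(new Dictionary<string, string>(urls));
new Results().ExtractHtmlDict(htmlResults, Results.Extraction.ById);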
/// <summary>
/// The extraction process, done via HtmlAgilityPack:
/// easy to collect information within a given html document by referencing element attributes.
/// </summary>
public class Results
{
    public struct ExtractionParameters
    {
        public string FileNameToSave;
        public string directoryPath;
        public string htmlElementType;
    }

    public enum Extraction
    {
        ById, ByClassName, ByElementName
    }

    public void ExtractHtmlDict(ConcurrentDictionary<string, string> htmlResults, Extraction by)
    {
        // helps with easy element extraction from the page.
        HtmlAttribute htAgPcAttrbs;
        HtmlDocument HtmlAgPCDoc = new HtmlDocument();
        // will hold a name + content of each document part that was eventually extracted;
        // from this container the result page can then be built
        Dictionary<string, HtmlDocument> dictResults = new Dictionary<string, HtmlDocument>();
        foreach (KeyValuePair<string, string> htmlPair in htmlResults)
        {
            Process(htmlPair);
        }
    }

    private static void Process(KeyValuePair<string, string> pair)
    {
        // do the html processing
    }
}
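Since Process is still a stub, here is a hedged sketch of what the HtmlAgilityPack side could look like; the element id "content" is a placeholder of mine, not something from the original pages:

private static void Process(KeyValuePair<string, string> pair)
{
    // pair.Key is the url, pair.Value is the downloaded html
    var doc = new HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(pair.Value);

    // example: pick an element by id ("content" is just a placeholder)
    HtmlAgilityPack.HtmlNode node = doc.GetElementbyId("content");
    if (node != null)
    {
        Console.WriteLine("{0}: {1}", pair.Key, node.InnerText.Trim());
    }
}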
public class SmartWebClient : WebClient
{
    private readonly int maxConcurrentConnectionCount;

    public SmartWebClient(int maxConcurrentConnectionCount = 20)
    {
        this.Proxy = null;
        this.Encoding = Encoding.GetEncoding("UTF-8");
        this.maxConcurrentConnectionCount = maxConcurrentConnectionCount;
    }

    protected override WebRequest GetWebRequest(Uri address)
    {
        var httpWebRequest = base.GetWebRequest(address) as HttpWebRequest;
        if (httpWebRequest == null)
        {
            return null;
        }
        if (maxConcurrentConnectionCount != 0)
        {
            // lift the default per-host connection limit so parallel requests are not throttled
            httpWebRequest.ServicePoint.ConnectionLimit = maxConcurrentConnectionCount;
        }
        return httpWebRequest;
    }
}
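To connect this back to the question: WebClient does not expose AddRange directly, but a subclass can apply a Range header inside GetWebRequest, so several instances could each fetch a different slice of the same URL. A minimal sketch (the class name and the From/To properties are mine, not part of the original code):

public class RangeWebClient : WebClient
{
    // inclusive byte range this instance should fetch; set before calling a download method
    public long From { get; set; }
    public long To { get; set; }

    protected override WebRequest GetWebRequest(Uri address)
    {
        var httpWebRequest = base.GetWebRequest(address) as HttpWebRequest;
        if (httpWebRequest != null)
        {
            httpWebRequest.AddRange(From, To);
        }
        return httpWebRequest;
    }
}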
This would let me take advantage of the good bandwidth; it is just that I am far from having a solution implemented, and I don't really know where to start.
Answer (score: 2):
If the server supports what Wikipedia calls byte serving, you can multiplex the download of a file, spawning multiple requests with specific Range header values (using the AddRange method on HttpWebRequest). Most serious HTTP servers do support byte ranges.
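Before multiplexing, it can be worth verifying that the server actually advertises range support. A small sketch using a HEAD request and the standard Accept-Ranges header (this helper is mine, not part of the answer's original code):

public static bool SupportsByteRanges(string uri)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);
    request.Method = "HEAD";
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        // servers that support byte serving typically reply with "Accept-Ranges: bytes"
        string acceptRanges = response.Headers["Accept-Ranges"];
        return acceptRanges != null && acceptRanges.Contains("bytes");
    }
}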
Here is some sample code that implements a parallel download of a file using byte ranges:
public static void ParallelDownloadFile(string uri, string filePath, int chunkSize)
{
    if (uri == null)
        throw new ArgumentNullException("uri");

    // determine the file size first
    long size = GetFileSize(uri);
    using (FileStream file = new FileStream(filePath, FileMode.Create, FileAccess.Write, FileShare.Write))
    {
        file.SetLength(size); // pre-allocate the full length
        object syncObject = new object(); // synchronizes file writes

        // round up so a trailing partial chunk is included,
        // without requesting one chunk too many when size divides evenly
        long chunkCount = (size + chunkSize - 1) / chunkSize;
        Parallel.ForEach(LongRange(0, chunkCount), (start) =>
        {
            // ask for just this chunk's byte range;
            // the server clamps the last range to the end of the file
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);
            request.AddRange(start * chunkSize, start * chunkSize + chunkSize - 1);
            HttpWebResponse response = (HttpWebResponse)request.GetResponse();
            lock (syncObject) // writes to the shared FileStream must not interleave
            {
                using (Stream stream = response.GetResponseStream())
                {
                    file.Seek(start * chunkSize, SeekOrigin.Begin);
                    stream.CopyTo(file);
                }
            }
        });
    }
}
public static long GetFileSize(string uri)
{
    if (uri == null)
        throw new ArgumentNullException("uri");

    // a HEAD request returns the headers (including Content-Length) without the body
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);
    request.Method = "HEAD";
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        return response.ContentLength;
    }
}
private static IEnumerable<long> LongRange(long start, long count)
{
    for (long i = 0; i < count; i++)
    {
        yield return start + i;
    }
}
Sample usage:
private static void TestParallelDownload()
{
    string uri = "http://localhost/welcome.png";
    string fileName = Path.GetFileName(uri);
    ParallelDownloadFile(uri, fileName, 10000);
}
PS: I would be curious to know whether doing this parallel thing really pays off compared to just using WebClient.DownloadFile... maybe in slow network scenarios?
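If anyone wants to test that empirically, a rough Stopwatch comparison could look like the sketch below (the URL and chunk size are placeholders; a single run is only an indication, not a benchmark):

private static void CompareDownloadTimes()
{
    string uri = "http://localhost/welcome.png"; // placeholder url
    var sw = System.Diagnostics.Stopwatch.StartNew();
    ParallelDownloadFile(uri, "parallel.png", 10000);
    Console.WriteLine("Parallel (byte ranges): {0} ms", sw.ElapsedMilliseconds);

    sw.Restart();
    using (var webClient = new WebClient())
    {
        webClient.DownloadFile(uri, "single.png");
    }
    Console.WriteLine("WebClient.DownloadFile: {0} ms", sw.ElapsedMilliseconds);
}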