I am trying to run at most 10 concurrent web requests at a time, and I limit the number of in-flight requests with a semaphore (see the code below). However, while it runs, memory keeps growing, and eventually it grows so much that the application becomes very slow.
A look into .NET Memory Profiler shows the following 5 entries, which appear to be the cause (and which also keep growing over the lifetime of the application):
At first glance I simply assumed the GC was not kicking in, but what worries me a little is that adding the following code still shows no effect:
while (true)
{
    Console.ReadKey();
    GC.Collect(GC.MaxGeneration, GCCollectionMode.Forced);
    GC.WaitForPendingFinalizers();
    GC.Collect(GC.MaxGeneration, GCCollectionMode.Forced);
}
(taken from @Jon Skeet's answer to "GC.Collect in a loop?")
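To rule out the possibility that what I am watching grow is only the process working set rather than the managed heap, I could also log the managed heap size around the forced collections, roughly like this (a diagnostic sketch only; note that GC.GetTotalMemory(true) itself forces and waits for a full collection before returning):

while (true)
{
    Console.ReadKey();
    var before = GC.GetTotalMemory(false);
    GC.Collect(GC.MaxGeneration, GCCollectionMode.Forced);
    GC.WaitForPendingFinalizers();
    GC.Collect(GC.MaxGeneration, GCCollectionMode.Forced);
    // GetTotalMemory(true) blocks until a full collection has finished,
    // so the value only counts objects that are still reachable.
    var after = GC.GetTotalMemory(true);
    Console.WriteLine("Managed heap: {0:N0} -> {1:N0} bytes", before, after);
}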
Full code:
static async Task<Page> GetPageAsync(this HttpClient client, long maxContentLength, Uri requestUri)
{
    var loadTimeMeasure = Stopwatch.StartNew();
    var response = await client.GetAsync(requestUri).ConfigureAwait(false);
    if (response.IsSuccessStatusCode)
    {
        using (response.Content)
        {
            var location = response.Headers.Location;
            var responseUri =
                location == null ? requestUri :
                location.IsAbsoluteUri ? location :
                new Uri(requestUri, location);
            var contentType = response.Content.Headers.ContentType.MediaType;
            var contentLength = response.Content.Headers.ContentLength.HasValue
                ? response.Content.Headers.ContentLength.Value : 0;
            Func<HtmlNode, long, Page> BuildPage = (htmlNode, loadTime) => new Page
            {
                RequestUri = requestUri,
                ResponseUri = responseUri,
                HTML = htmlNode,
                LoadTime = loadTime,
                StatusCode = response.StatusCode,
                Headers = ExtractHeaders(response),
                ContentLength = contentLength,
                ContentType = contentType.ToString()
            };
            if (contentLength > maxContentLength || !contentType.Contains("text/html"))
            {
                var page = BuildPage(null, loadTimeMeasure.ElapsedMilliseconds);
                response.Content.Dispose();
                return page;
            }
            var content = await response.Content.ReadAsStringAsync();
            var page2 = BuildPage(TryMapToHtmlNode(content), loadTimeMeasure.ElapsedMilliseconds);
            return page2;
        }
    }
    else
    {
        if (response.Content != null)
            response.Content.Dispose();
        return null;
    }
}
static HtmlNode TryMapToHtmlNode(string input)
{
    try
    {
        var doc = new HtmlDocument();
        doc.LoadHtml(input);
        return doc.DocumentNode;
    }
    catch
    {
        return null;
    }
}
public static IEnumerable<Uri> ExtractLinks(this HtmlNode node, Uri baseUri)
{
    var nodes = node.SelectNodes("//a[@href]");
    if (nodes == null || nodes.Count == 0)
        return Enumerable.Empty<Uri>();
    return nodes
        .Select(x => TryParseUri(x.GetAttributeValue("href", ""), baseUri))
        .Where(x => x != null && (x.Scheme == "http" || x.Scheme == "https"))
        .Select(x =>
        {
            var fragmentIndex = x.ToString().LastIndexOf("#", StringComparison.Ordinal);
            if (fragmentIndex == -1)
                return x;
            return new Uri(x.ToString().Substring(0, fragmentIndex));
        });
}
static Uri TryParseUri(string url, Uri baseUri)
{
    Uri newUri;
    if (Uri.TryCreate(baseUri, url, out newUri))
        return newUri;
    return null;
}
static ImmutableDictionary<string, string> ExtractHeaders(HttpResponseMessage response)
{
    var headers = response.Headers
        .Select(x => Tuple.Create(x.Key, x.Value.First()));
    var contentHeaders = response.Content.Headers
        .Select(x => Tuple.Create(x.Key, x.Value.First()));
    return headers
        .Concat(contentHeaders)
        .ToImmutableDictionary(x => x.Item1, x => x.Item2);
}
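One thing I am not sure matters: GetPageAsync only ever disposes response.Content, never the HttpResponseMessage itself. A cut-down variant that wraps the whole response in a using would look roughly like this (a sketch of the shape only, reusing the Page type and TryMapToHtmlNode from above; it is not the code the profiler numbers came from):

static async Task<Page> GetPageDisposingResponseAsync(this HttpClient client, long maxContentLength, Uri requestUri)
{
    var loadTimeMeasure = Stopwatch.StartNew();
    // Dispose the whole HttpResponseMessage, not just response.Content.
    using (var response = await client.GetAsync(requestUri).ConfigureAwait(false))
    {
        if (!response.IsSuccessStatusCode)
            return null;
        var contentTypeHeader = response.Content.Headers.ContentType;
        var mediaType = contentTypeHeader == null ? "" : contentTypeHeader.MediaType;
        var contentLength = response.Content.Headers.ContentLength ?? 0;
        if (contentLength > maxContentLength || !mediaType.Contains("text/html"))
            return new Page
            {
                RequestUri = requestUri,
                StatusCode = response.StatusCode,
                LoadTime = loadTimeMeasure.ElapsedMilliseconds
            };
        var content = await response.Content.ReadAsStringAsync().ConfigureAwait(false);
        return new Page
        {
            RequestUri = requestUri,
            HTML = TryMapToHtmlNode(content),
            StatusCode = response.StatusCode,
            LoadTime = loadTimeMeasure.ElapsedMilliseconds
        };
    }
}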
And the Main method:
static void Main(string[] args)
{
    ServicePointManager.DefaultConnectionLimit = 5000;
    ServicePointManager.Expect100Continue = false;
    var hosts = File.ReadAllLines(@"C:\domains.txt");
    var handler = new WebRequestHandler
    {
        AllowAutoRedirect = false,
        UseProxy = false,
        Proxy = null
    };
    var client = new HttpClient
    {
        Timeout = TimeSpan.FromMinutes(20),
    };
    var semp = new SemaphoreSlim(10);
    foreach (var h in hosts)
    {
        semp.Wait();
        WebExtensions.GetPageAsync(client, 5242880, new Uri(h))
            .ContinueWith(_ => semp.Release());
    }
    while (true)
    {
        Console.ReadKey();
        GC.Collect(GC.MaxGeneration, GCCollectionMode.Forced);
        GC.WaitForPendingFinalizers();
        GC.Collect(GC.MaxGeneration, GCCollectionMode.Forced);
    }
}
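For completeness: the throttling loop in Main could also be written with WaitAsync and a try/finally, so the semaphore permit is released even when a request throws and exceptions are not silently swallowed by the ContinueWith. Something like this (a sketch only; CrawlAsync is a name I made up, and the behaviour described above was observed with the loop shown in Main):

static async Task CrawlAsync(HttpClient client, IEnumerable<string> hosts, int maxConcurrency)
{
    var semp = new SemaphoreSlim(maxConcurrency);
    var tasks = new List<Task>();
    foreach (var h in hosts)
    {
        await semp.WaitAsync();
        tasks.Add(Task.Run(async () =>
        {
            try
            {
                await WebExtensions.GetPageAsync(client, 5242880, new Uri(h));
            }
            finally
            {
                // Always give the permit back, even if the request failed.
                semp.Release();
            }
        }));
    }
    // Surfaces any request exceptions instead of dropping them.
    await Task.WhenAll(tasks);
}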