我编写了一个应用程序,它通过我们自己的属性并废弃数据。为了确保我没有运行相同的URL,我使用MySQL数据库来存储URL,一旦处理就标记它。所有这些都是在一个线程中完成的,如果我只有几千个条目就可以了。但是我需要解析几十万个条目,所以我需要对代码进行更改(我一般是多线程中的新手)。我找到了一个例子,并试图复制样式,但似乎没有用。任何人都知道以下代码的问题是什么?
编辑:对不起并不是要让人们猜测这个问题,但是包括异常在内我是愚蠢的。这是例外 “System.InValidCastException:'指定的强制转换无效。'” 当我开始该过程时,它从数据库收集URL,然后永远不会点击DoWork方法//这将从数据库中获取条目
List<Mappings> items = bot.GetUrlsToProcess(100);
if (items != null)
{
var tokenSource = new CancellationTokenSource();
var token = tokenSource.Token;
Worker.Done = new Worker.DoneDelegate(WorkerDone);
foreach (var item in items)
{
urls.Add(item.Url);
WaitingTasks.Enqueue(new Task(id => new Worker().DoWork((int)id, item.Url, token), item.Url, token));
}
LaunchTasks();
}
static async void LaunchTasks()
{
// keep checking until we're done
while ((WaitingTasks.Count > 0) || (RunningTasks.Count > 0))
{
// launch tasks when there's room
while ((WaitingTasks.Count > 0) && (RunningTasks.Count < MaxRunningTasks))
{
Task task = WaitingTasks.Dequeue();
lock (RunningTasks) RunningTasks.Add((int)task.AsyncState, task);
task.Start();
}
UpdateConsole();
await Task.Delay(300); // wait before checking again
}
UpdateConsole(); // all done
}
static void UpdateConsole()
{
Console.Write(string.Format("\rwaiting: {0,3:##0} running: {1,3:##0} ", WaitingTasks.Count, RunningTasks.Count));
}
static void WorkerDone(int id)
{
lock (RunningTasks) RunningTasks.Remove(id);
}
public class Worker
{
public delegate void DoneDelegate(int taskId);
public static DoneDelegate Done { private get; set; }
public async void DoWork(object id, string url, CancellationToken token)
{
if (token.IsCancellationRequested) return;
Content obj;
try
{
int tries = 0;
bool IsUrlProcessed = true;
DateTime dtStart = DateTime.Now;
string articleDate = string.Empty;
try
{
ScrapeWeb bot = new ScrapeWeb();
SearchApi searchApi = new SearchApi();
SearchHits searchHits = searchApi.Url(url, 5, 0);
if (searchHits.Hits.Count() == 0)
{
obj = await bot.ReturnArticleObject(url);
if (obj.Code != HttpStatusCode.OK)
{
Console.WriteLine(string.Format("\r Status is {0}", obj.Code));
tries = itemfound.UrlMaxTries + 1;
IsUrlProcessed = false;
itemfound.HttpCode = obj.Code;
}
else
{
string title = obj.Title;
string content = obj.Contents;
string description = obj.Description;
Articles article = new Articles();
article.Site = url.GetSite();
article.Content = content;
article.Title = title;
article.Url = url.ToLower();
article.Description = description;
string strThumbNail = HtmlHelper.GetImageUrl(url, obj.RawResponse);
article.Author = HtmlHelper.GetAuthor(url, obj.RawResponse);
if (!string.IsNullOrEmpty(strThumbNail))
{
//This condition needs to be added to remove ?n=<number> from EP thumbnails
if (strThumbNail.Contains("?"))
{
article.ImageUrl = strThumbNail.Substring(0, strThumbNail.IndexOf("?")).Replace("http:", "https:");
}
else
article.ImageUrl = strThumbNail.Replace("http:", "https:");
}
else
{
article.ImageUrl = string.IsNullOrEmpty(strThumbNail) ? article.Url.GetDefaultImageUrls() : strThumbNail.Replace("http:", "https:");
}
articleDate = HtmlHelper.GetPublishDate(url, obj.RawResponse);
if (string.IsNullOrEmpty(articleDate))
article.Pubdate = DateTime.Now;
else
article.Pubdate = DateTime.Parse(articleDate);
var client = new Index(searchApi);
var result = client.Upsert(article);
itemfound.HttpCode = obj.Code;
if (result)
{
itemfound.DateCreated = DateTime.Parse(articleDate);
itemfound.DateModified = DateTime.Parse(articleDate);
UpdateItem(itemfound);
}
else
{
tries = itemfound.UrlMaxTries + 1;
IsUrlProcessed = false;
itemfound.DateCreated = DateTime.Parse(articleDate);
itemfound.DateModified = DateTime.Parse(articleDate) == null ? DateTime.Now : DateTime.Parse(articleDate);
UpdateItem(itemfound, tries, IsUrlProcessed);
}
}
}
else
{
tries = itemfound.UrlMaxTries + 1;
IsUrlProcessed = true;
itemfound.HttpCode = HttpStatusCode.OK;
itemfound.DateCreated = DateTime.Parse(articleDate);
itemfound.DateModified = DateTime.Parse(articleDate) == null ? DateTime.Now : DateTime.Parse(articleDate);
}
}
catch (Exception e)
{
tries = itemfound.UrlMaxTries + 1;
IsUrlProcessed = false;
itemfound.DateCreated = DateTime.Parse(articleDate);
itemfound.DateModified = DateTime.Parse(articleDate) == null ? DateTime.Now : DateTime.Parse(articleDate);
}
finally
{
DateTime dtEnd = DateTime.Now;
Console.WriteLine(string.Format("\r Total time taken to process items is {0}", (dtEnd - dtStart).TotalSeconds));
}
}
catch (Exception e)
{
Console.WriteLine(e);
}
Done((int)id);
}
}
所有这些代码都来自Best multi-thread approach for multiple web requests此链接。有人能告诉我如何运行这种方法吗?
答案 0 :(得分:1)
我认为问题在于您创建任务的方式:
new Task(id => new Worker().DoWork((int)id, item.Url, token), item.Url, token)
此Task
构造函数重载期望Action<object>
委托。这意味着id
将被输入为object
,您需要先将其转换为有用的内容。
<强>参数强>
action
- 输入:
System.Action<Object>
- 代表要在任务中执行的代码的委托。
state
- 输入:
System.Object
- 表示操作使用的数据的对象。
cancellationToken
- 类型:
System.Threading.CancellationToken
- 新任务将遵守的CancellationToken
。
您决定通过致电int
将其投放到(int)id
,但您已将item.Url
作为对象本身传递。我无法100%告诉您Url
的类型,但我不希望Url
- 命名属性属于int
类型。
答案 1 :(得分:0)
基于@MarcinJuraszek所说的我刚回到我的代码并添加了一个int,因为我找不到另一种方法来解决它。这是我做的改变
int i=0
foreach (var item in items)
{
urls.Add(item.Url);
WaitingTasks.Enqueue(new Task(id => new Worker().DoWork((string)id, item.Url, token), item.Url, token));
i++;
}