在 Python 中,我使用 libxml,代码看起来像这样:
parser = etree.HTMLParser()
# Collect the calendar rows from the main calendar table, numbering the kept rows in order.
row_id = 0
candidate_rows = node.findall(r'.//div[@id="flexBox_flex_calendar_mainCal"]//table/tr[@class]')
for tr in candidate_rows:
    # The XPath only requires a class attribute; keep rows whose class starts with 'calendar'.
    if not tr.attrib['class'].startswith('calendar'):
        continue
    newsitems.addRow(GetARow(tr, row_id))
    # NOTE(review): the original indentation was lost; the increment is assumed to
    # belong to the matching-row branch -- confirm.
    row_id = row_id + 1

# Copy every stored row into the table as a list of its column values.
for index in range(0, newsitems.getLength()):
    rowDict = newsitems.getRow(index)
    if rowDict is None:
        continue
    rowItems = QStringList([rowDict['Time'], rowDict['Currency'], rowDict['Impact'], rowDict['Event'], rowDict['Actual'], rowDict['Forecast'], rowDict['Previous']] )
    #newsItems[rowDict['Time']].append(rowItems)
    newsTable.addrow(rowItems)
下面是我的 C# 代码。我需要提取相同的字段,但不确定该如何做到——问题出在 whatNodesToFind 字符串上。
using System;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Linq;
using System.Windows.Forms;
using HtmlAgilityPack;
namespace ConsoleApplication276
{
/// <summary>
/// Pairs a URL with the callback that should process the HTML downloaded from it.
/// </summary>
public class Link
{
    /// <summary>Address of the page to load.</summary>
    public string link { get; set; }

    /// <summary>Callback that receives the loaded page's HTML text.</summary>
    public Action<string> parser { get; set; }
}
public class Program
{
// Lower-case three-letter month abbreviations indexed by DateTime.Month (1-12).
// Index 0 is an unused placeholder so the month number can be used directly.
static string[] monthstrings = new string[] { "", "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec" };

/// <summary>
/// Formats today's local date for the calendar URL's "day" query parameter,
/// e.g. "aug7.2015".
/// </summary>
/// <returns>Month abbreviation, day of month, a dot, then the four-digit year.</returns>
public static string GetDateInFOREXFactoryFormat()
{
    return GetDateInFOREXFactoryFormat(System.DateTime.Now);
}

/// <summary>
/// Formats the given date for the calendar URL's "day" query parameter,
/// e.g. 2015-08-07 becomes "aug7.2015".
/// </summary>
/// <param name="date">The date to format; only year, month and day are used.</param>
/// <returns>Month abbreviation, day of month, a dot, then the four-digit year.</returns>
public static string GetDateInFOREXFactoryFormat(System.DateTime date)
{
    // The value ends up in a URL, so format the numbers culture-independently.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;
    var dayStr = date.Day.ToString(invariant);
    var monthStr = monthstrings[date.Month];
    var yearStr = date.Year.ToString(invariant);

    // Month comes before the day ("aug7.2015"), not after it ("7aug.2015").
    return monthStr + dayStr + "." + yearStr;
}
/// <summary>
/// Entry point of the console app. Loads today's calendar page through
/// <c>MessageLoopWorker.Run</c> / <c>DoWorkAsync</c>, parses it with HtmlAgilityPack,
/// and prints the name and id of every matched node.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
public static void Main(string[] args)
{
    try
    {
        // Download each page and hand its content to the associated parser.
        // More links can be added here; each Link carries its own parser action.
        var task = MessageLoopWorker.Run(DoWorkAsync, new Link()
        {
            link = "http://www.forexfactory.com/calendar.php?day=" + GetDateInFOREXFactoryFormat(),
            parser = (string html) =>
            {
                var doc = new HtmlAgilityPack.HtmlDocument();
                doc.LoadHtml(html);
                string whatNodesToFind = ".//div";
                var someNodes = doc.DocumentNode.SelectNodes(whatNodesToFind);

                // SelectNodes yields null (not an empty collection) when nothing matches,
                // so guard before enumerating.
                if (someNodes == null)
                {
                    Console.WriteLine("No nodes matched XPath: " + whatNodesToFind);
                    return;
                }

                foreach (var node in someNodes)
                {
                    // Printing the node object itself would only show its type name.
                    Console.WriteLine("{0}, {1}", node.Name, node.GetAttributeValue("id", ""));
                }
            }
        });

        // Block until the worker finishes. GetResult rethrows the original exception
        // instead of wrapping it in an AggregateException, so ex.Message below is meaningful.
        task.GetAwaiter().GetResult();
        Console.WriteLine("DoWorkAsync completed.");
    }
    catch (Exception ex)
    {
        Console.WriteLine("DoWorkAsync failed: " + ex.Message);
    }
    Console.WriteLine("Press Enter to exit.");
    Console.ReadLine();
}
/// <summary>
/// Visits each link in order with a single WebBrowser control and, once a page's
/// DocumentCompleted event has fired, passes the page text to that link's parser.
/// </summary>
/// <param name="args">The links to visit, in order.</param>
/// <returns>A task that finishes after the last link is processed; its result is always null.</returns>
public static async Task<Link> DoWorkAsync(Link[] args)
{
    Console.WriteLine("Start working.");
    using (var browser = new WebBrowser())
    {
        browser.ScriptErrorsSuppressed = true;

        // Replaced before every navigation; the handler captures the variable,
        // so it always completes the source belonging to the current page.
        TaskCompletionSource<bool> pageLoaded = null;
        WebBrowserDocumentCompletedEventHandler onDocumentCompleted = delegate
        {
            pageLoaded.TrySetResult(true);
        };

        for (var index = 0; index < args.Length; index++)
        {
            var target = args[index];
            pageLoaded = new TaskCompletionSource<bool>();
            browser.DocumentCompleted += onDocumentCompleted;
            try
            {
                browser.Navigate(target.link.ToString());

                // Resume only after DocumentCompleted has been raised.
                await pageLoaded.Task;

                // The page has loaded: give its text to the parser.
                target.parser(browser.DocumentText);
            }
            finally
            {
                // Always detach, even if navigation or parsing failed.
                browser.DocumentCompleted -= onDocumentCompleted;
            }

            // Dump the URL followed by the body markup of the loaded document.
            Console.WriteLine(target.link.ToString());
            Console.WriteLine(browser.Document.Body.OuterHtml);
        }
    }
    Console.WriteLine("End working.");
    return null;
}
}
// Helper that starts a dedicated thread running a Windows Forms message loop
// and executes an asynchronous worker on that thread.
public static class MessageLoopWorker
{
// Runs `worker(args)` on a new STA thread that pumps messages via Application.Run.
// The returned task completes with the worker's result, or faults with the exception
// the worker threw; in both cases the thread is joined before this method returns.
// NOTE(review): presumably needed because the WebBrowser control used by the worker
// requires an STA thread with a message loop -- confirm.
public static async Task<Object> Run(Func<Link[], Task<Link>> worker, params Link[] args)
{
// carries the worker's outcome from the message-loop thread back to the caller
var tcs = new TaskCompletionSource<object>();
var thread = new Thread(() =>
{
// declared before assignment so the lambda can unsubscribe itself
EventHandler idleHandler = null;
idleHandler = async (s, e) =>
{
// handle Application.Idle just once
Application.Idle -= idleHandler;
// return to the message loop
await Task.Yield();
// and continue asynchronously
// propagate the result or exception to the awaiting caller
try
{
var result = await worker(args);
tcs.SetResult(result);
}
catch (Exception ex)
{
tcs.SetException(ex);
}
// signal to exit the message loop
// Application.Run will exit at this point
Application.ExitThread();
};
// handle Application.Idle just once
// to make sure we're inside the message loop
// and SynchronizationContext has been correctly installed
Application.Idle += idleHandler;
// blocks this thread in the message loop until Application.ExitThread is called
Application.Run();
});
// set STA model for the new thread
thread.SetApartmentState(ApartmentState.STA);
// start the thread and await for the task
thread.Start();
try
{
return await tcs.Task;
}
finally
{
// wait for the message-loop thread to finish, whether the worker succeeded or failed
thread.Join();
}
}
}
}
我尝试了下面的代码,但它不起作用——没有返回任何节点。然而,使用 Google Chrome 的“检查元素”功能可以看到这些节点:
// Pick the <div> elements that carry a class attribute and whose id contains "flex".
// NOTE(review): the id attribute is dereferenced without first checking that it exists.
var findclasses =
    from d in doc.DocumentNode.Descendants("div")
    where d.Attributes.Contains("class") && d.Attributes["id"].Value.Contains("flex")
    select d;
foreach (var match in findclasses)
{
    Console.WriteLine(match);
}
答案 0 :(得分:0)
关于“修改 1”部分,我建议使用 d.GetAttributeValue("id", "")
替换 d.Attributes["id"].Value,
因为后者会在当前 d 元素没有 id 属性的情况下抛出异常
(在解析从本示例的 URL 中检索到的 HTML 页面时确实发生了这种情况):
**Dotnetfiddle Demo**
输出
// Download the calendar page for a fixed date and parse it with HtmlAgilityPack.
var link = "http://www.forexfactory.com/calendar.php?day=aug7.2015";
var web = new HtmlWeb();
var doc = web.Load(link);

// Keep <div> elements that have a class attribute and whose id contains "flex".
// GetAttributeValue falls back to "" when the element has no id attribute.
var findclasses = doc.DocumentNode
    .Descendants("div")
    .Where(d => d.Attributes.Contains("class"))
    .Where(d => d.GetAttributeValue("id", "").Contains("flex"));

foreach (var d in findclasses)
{
    var id = d.GetAttributeValue("id", "");
    Console.WriteLine("{0}, {1}", d.Name, id);
}
答案 1 :(得分:0)
答案非常微妙。事实证明,下载到的 HTML 中缺少了一部分内容!解决办法是使用带有正确请求头的 WebClient 来下载页面:
using (WebClient wb = new WebClient())
{
wb.Headers["User-Agent"] =
"User-Agent" + "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.9.0.3 Gecko/2008092417 Firefox/3.0.3";