我正在做一个学生项目,负责从Dice.com抓取职位发布信息以进行分析。最关键的部分是职位描述,但我不知道该如何获取它。我没有HTML经验,C#经验也很少。打开该网站时,您会看到一个职位发布列表;必须单击某个职位,才会打开一个显示其全部详细信息的新页面。我该如何访问这个详情页面,以便把详细信息写入控制台?以下是我目前的代码,非常简单。
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Net.Http;
using HtmlAgilityPack;
using System.Text.RegularExpressions;
using OpenQA.Selenium.Chrome;
using OpenQA.Selenium.Support;
using OpenQA.Selenium;
using System.Collections;
using System.Xml.Linq;
using OpenQA.Selenium.Support.UI;
namespace WebScaper
{
    /// <summary>
    /// Console scraper that downloads one Dice.com search results page and prints the
    /// company name, title, summary and location of every job posting found on it.
    /// </summary>
    class Program
    {
        // Search results page to scrape. Kept on a single line: a regular string
        // literal cannot span several source lines.
        private const string SearchUrl =
            "https://www.dice.com/jobs?q=information+technology&l=arkansas#dice";

        /// <summary>
        /// Entry point: runs the scrape to completion, then waits for Enter so the
        /// console window stays open.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // Wait for the scrape to finish before prompting, so that any exception
            // it raises propagates to Main instead of being lost.
            GetHtmlAsync().GetAwaiter().GetResult();
            Console.ReadLine();
        }

        /// <summary>
        /// Downloads the search results page, locates the job posting nodes and writes
        /// the fields of each posting to the console.
        /// </summary>
        /// <returns>A task that completes when all postings have been printed.</returns>
        static async Task GetHtmlAsync()
        {
            string html;
            using (var httpClient = new HttpClient())
            {
                html = await httpClient.GetStringAsync(SearchUrl);
            }

            var htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(html);

            var resultsContainer = htmlDocument.DocumentNode.Descendants("div")
                .FirstOrDefault(node => node.GetAttributeValue("id", "")
                    .Equals("search-results-control"));
            if (resultsContainer == null)
            {
                // The page layout changed or the request returned a different page.
                Console.WriteLine("No search results container was found on the page.");
                return;
            }

            var jobListItems = resultsContainer.Descendants("div")
                .Where(node => node.GetAttributeValue("class", "")
                    .Contains("complete-serp-result-div"))
                .ToList();

            foreach (var jobListItem in jobListItems)
            {
                Console.WriteLine("Company Name: " + GetSpanText(jobListItem, "class", "compName"));
                Console.WriteLine("Job Title: " + GetSpanText(jobListItem, "itemprop", "title"));
                Console.WriteLine("Job Summary: " + GetSpanText(jobListItem, "itemprop", "description"));
                Console.WriteLine("Job Location: " + GetSpanText(jobListItem, "class", "jobLoc"));
            }
        }

        /// <summary>
        /// Returns the trimmed inner text of the first descendant <c>span</c> of
        /// <paramref name="parent"/> whose attribute <paramref name="attributeName"/>
        /// contains <paramref name="attributeValue"/>.
        /// </summary>
        /// <param name="parent">Node whose descendants are searched.</param>
        /// <param name="attributeName">Name of the attribute to inspect.</param>
        /// <param name="attributeValue">Text the attribute value must contain.</param>
        /// <returns>The span text, or an empty string when no such span exists.</returns>
        private static string GetSpanText(HtmlNode parent, string attributeName, string attributeValue)
        {
            var span = parent.Descendants("span")
                .FirstOrDefault(node => node.GetAttributeValue(attributeName, "")
                    .Contains(attributeValue));

            // A posting may lack one of the fields; report it as empty rather than crash.
            return span == null ? "" : span.InnerText.Trim('\r', '\n', '\t');
        }
    }
}
答案 0 :(得分:0)
您可以仅使用HtmlAgilityPack来完成此操作。而且如果使用XPath,代码会变得更加简洁明了。
using System;
using System.Collections.Generic;
using HtmlAgilityPack;
/// <summary>
/// Console scraper that loads a Dice.com search results page, follows each job
/// posting's link to its detail page, and prints the collected information.
/// </summary>
class Program
{
    private static readonly string domainUrl = @"https://www.dice.com";
    private static readonly HtmlWeb web = new HtmlWeb();
    private static readonly List<JobInfo> jobsInfoList = new List<JobInfo>();

    /// <summary>
    /// Entry point: downloads all postings, prints them, then waits for a key press.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        Console.WriteLine("Wait a bit until all pages are downloaded..\n");
        JustDoIt();
        AndPrintResults();
        Console.ReadKey();
    }

    /// <summary>
    /// Loads the search results page and, for every job posting on it, collects the
    /// listing fields plus the description from the posting's detail page into
    /// <c>jobsInfoList</c>.
    /// </summary>
    static void JustDoIt()
    {
        var url = domainUrl + @"/jobs?q=information+technology&l=arkansas#dice";
        var htmlDoc = web.Load(url);
        var jobsNodes = htmlDoc.DocumentNode.SelectNodes("//*[@class='complete-serp-result-div']");
        if (jobsNodes == null)
        {
            // SelectNodes can return null (not an empty collection) when nothing matches.
            return;
        }

        foreach (var jobNode in jobsNodes)
        {
            var jobInfo = new JobInfo
            {
                Title = GetInnerText(jobNode, ".//span[@itemprop='title']"),
                CompanyName = GetInnerText(jobNode, ".//span[@itemprop='name']"),
                Location = GetInnerText(jobNode, ".//span[@itemprop='addressLocality']"),
                Summary = GetInnerText(jobNode, ".//span[@itemprop='description']"),
                Description = LoadDescription(jobNode),
            };
            jobsInfoList.Add(jobInfo);
        }
    }

    /// <summary>
    /// Returns the inner text of the first node under <paramref name="parent"/> that
    /// matches <paramref name="xpath"/>.
    /// </summary>
    /// <param name="parent">Node the XPath expression is evaluated against.</param>
    /// <param name="xpath">XPath expression selecting the wanted node.</param>
    /// <returns>The node's inner text, or an empty string when no node matches.</returns>
    private static string GetInnerText(HtmlNode parent, string xpath)
    {
        var node = parent.SelectSingleNode(xpath);
        return node == null ? string.Empty : node.InnerText;
    }

    /// <summary>
    /// Follows the posting's link to its detail page and returns the inner HTML of
    /// the element with id <c>jobdescSec</c>.
    /// </summary>
    /// <param name="jobNode">Search-result node of a single job posting.</param>
    /// <returns>
    /// The description markup, or an empty string when the link or the description
    /// element is missing.
    /// </returns>
    private static string LoadDescription(HtmlNode jobNode)
    {
        var linkNode = jobNode.SelectSingleNode(".//a[contains(@id,'position')]");
        var href = linkNode == null ? string.Empty : linkNode.GetAttributeValue("href", "");
        if (string.IsNullOrEmpty(href))
        {
            return string.Empty;
        }

        // Resolve against the site root so both relative and absolute hrefs work.
        var descriptionUrl = new Uri(new Uri(domainUrl), href);
        var descriptionPage = web.Load(descriptionUrl.AbsoluteUri);
        var descriptionNode = descriptionPage.DocumentNode.SelectSingleNode("//*[@id='jobdescSec']");
        return descriptionNode == null ? string.Empty : descriptionNode.InnerHtml;
    }

    /// <summary>
    /// Writes every collected posting to the console.
    /// </summary>
    static void AndPrintResults()
    {
        foreach (var job in jobsInfoList)
        {
            Console.WriteLine($"Title: {job.Title}");
            Console.WriteLine($"CompanyName: {job.CompanyName}");
            Console.WriteLine($"Location: {job.Location}");
            Console.WriteLine($"Summary: {job.Summary}");
            // The description is cut to at most 1000 characters only to keep the
            // console output readable; the stored value is left untouched.
            var description = job.Description ?? string.Empty;
            var trimmedDescription = description.Length > 1000 ? description.Substring(0, 1000) : description;
            Console.WriteLine($"Description:\n {trimmedDescription}");
            Console.WriteLine($"==================================================================\n");
        }
    }

    /// <summary>
    /// Data collected for a single job posting.
    /// </summary>
    public class JobInfo
    {
        /// <summary>Text of the span with <c>itemprop='name'</c>.</summary>
        public string CompanyName { get; set; }

        /// <summary>Text of the span with <c>itemprop='title'</c>.</summary>
        public string Title { get; set; }

        /// <summary>Text of the span with <c>itemprop='description'</c> on the results page.</summary>
        public string Summary { get; set; }

        /// <summary>Text of the span with <c>itemprop='addressLocality'</c>.</summary>
        public string Location { get; set; }

        /// <summary>Inner HTML of the <c>jobdescSec</c> element on the detail page.</summary>
        public string Description { get; set; }
    }
}