我有一个存储了大量网页的数据库,想用 HTML Agility Pack 从这些网页中提取信息。我已经写好了一个函数:当我从富文本框(RichTextBox)加载文本时,它可以获取我想要的信息;但当我从 SQL 数据库中以字符串形式加载 HTML 时,它无法从节点中正确获取全部文本,因此得不到我需要的网页上的全部信息。
/// <summary>
/// Parses one product page with HtmlAgilityPack and returns the extracted fields
/// as a fixed-size string array.
/// </summary>
/// <param name="strWebpage">Raw HTML of the product page; <c>null</c> is treated as an empty page.</param>
/// <param name="strURL">Page source URL; copied unchanged into slot 15.</param>
/// <param name="iID">Raw page id; stored as text in slot 16.</param>
/// <returns>
/// A 17-element array laid out as:
/// [0] category path, [1] list price, [2] availability ("1" or "0"), [3] language,
/// [4] arrangement, [5] skill level, [6] publisher, [7] catalogue no., [8] description,
/// [9] published date (yyyy-MM-dd, or "0000-00-00" when unparsable), [10] format,
/// [11] pages, [12] ISBN, [13] title, [14] image URL, [15] source URL, [16] page id.
/// Slots whose HTML node or label is missing from the page are left <c>null</c>,
/// except [0] and [8], which default to an empty string.
/// </returns>
private static string[] Data(string strWebpage,string strURL, int iID)
{
    // Load the page into HtmlAgilityPack.
    HtmlAgilityPack.HtmlDocument document = new HtmlAgilityPack.HtmlDocument();
    document.LoadHtml(strWebpage ?? string.Empty);
    HtmlAgilityPack.HtmlNode root = document.DocumentNode;

    string[] data = new string[17]; // Return array, see <returns> for the slot layout.

    // SelectSingleNode/SelectNodes yield null when nothing matches, so every
    // result below is null-checked before use instead of crashing the whole parse.
    HtmlAgilityPack.HtmlNode topInfoNode = root.SelectSingleNode("//div[@class='product_info']"); // Top product information
    HtmlAgilityPack.HtmlNode bottomInfoNode = root.SelectSingleNode("//table[@width='300px']"); // Bottom product information
    HtmlAgilityPack.HtmlNode titleNode = root.SelectSingleNode("//h2[@class='name']"); // Product title
    HtmlAgilityPack.HtmlNode imageNode = root.SelectSingleNode("//div[@class='product_image']"); // Product image URL
    HtmlAgilityPack.HtmlNode descriptionNode = root.SelectSingleNode("//div[@class='contentwrapper']"); // Product description
    HtmlAgilityPack.HtmlNodeCollection categoryLinks = root.SelectNodes("//div[@class='conttopright']//a[@class='uponelevel']"); // Product category

    // Collect "label<TAB>value" entries from both information areas.
    // NOTE(review): this relies on TextToNArray returning strictly alternating
    // label/value entries. If the HTML stored in the database is split differently
    // (e.g. other line endings or extra blank lines) the pairs shift and fields go
    // missing - verify TextToNArray against the stored text.
    List<string> labelValuePairs = new List<string>();
    if (topInfoNode != null)
    {
        AppendLabelValuePairs(TextToNArray(topInfoNode.InnerText), labelValuePairs);
    }
    if (bottomInfoNode != null)
    {
        AppendLabelValuePairs(TextToNArray(bottomInfoNode.InnerText), labelValuePairs);
    }

    if (titleNode != null)
    {
        data[13] = titleNode.InnerText.Trim(); // Title
    }
    if (imageNode != null)
    {
        // Strip the fixed <img ...> wrapper so only the src value remains.
        // NOTE(review): this depends on the exact attribute text/order of the tag;
        // reading the img node's src attribute would be sturdier - confirm before changing.
        data[14] = imageNode.InnerHtml.Replace("\\", "\\\\").Replace("<img id=\"ctl00_ContentPlaceHolder1_ProductImage\" src=\"", "").Replace("\" alt=\"Product Image\" style=\"border-width:0px;\">", "").Trim();
    }
    data[15] = strURL; // Page source URL
    data[16] = iID.ToString(); // Raw page id

    // Build the category path from the breadcrumb links, dropping the leading "Home-".
    string category = "";
    if (categoryLinks != null)
    {
        foreach (HtmlAgilityPack.HtmlNode link in categoryLinks)
        {
            category += "-" + link.InnerText;
        }
    }
    data[0] = category.Trim('-').Trim().Replace("Home-", "");

    // Description is the text between the "Description" and "More Product Details" lines.
    data[8] = descriptionNode != null
        ? ExtractDescription(TextToNArray(descriptionNode.InnerText))
        : "";

    // Sort the collected label/value pairs into their slots.
    foreach (string pair in labelValuePairs)
    {
        // Every entry was built with exactly one separating tab, so parts[1] always exists.
        string[] parts = pair.Split('\t');
        string value = parts[1];

        // Invariant lower-casing keeps label matching independent of the machine's culture.
        switch (parts[0].Trim().ToLowerInvariant())
        {
            case "list price*":
                data[1] = ParseListPrice(value).ToString();
                break;
            case "availability":
                data[2] = string.Equals(value, "available", StringComparison.OrdinalIgnoreCase) ? "1" : "0";
                break;
            case "language":
                data[3] = value;
                break;
            case "arrangement":
                data[4] = value;
                break;
            case "skill level":
                data[5] = value;
                break;
            case "publisher":
                data[6] = value;
                break;
            case "catalogue no.":
                data[7] = value;
                break;
            case "published on":
                // Normalise to yyyy-MM-dd for the database; fall back to the zero date.
                DateTime publishedOn;
                data[9] = DateTime.TryParse(value, out publishedOn)
                    ? publishedOn.ToString("yyyy-MM-dd")
                    : "0000-00-00";
                break;
            case "format":
                data[10] = value;
                break;
            case "pages":
                data[11] = value;
                break;
            case "isbn":
                data[12] = value;
                break;
        }
    }

    return data;
}

// Adds "label<TAB>value" entries for each consecutive pair in bits; a trailing unpaired entry is ignored.
private static void AppendLabelValuePairs(string[] bits, List<string> pairs)
{
    if (bits == null)
    {
        return;
    }
    for (int i = 0; i + 1 < bits.Length; i += 2)
    {
        pairs.Add(bits[i].Trim() + "\t" + bits[i + 1].Trim());
    }
}

// Concatenates the trimmed lines found after each "Description" line, up to "More Product Details" or the end of the input.
private static string ExtractDescription(string[] lines)
{
    if (lines == null)
    {
        return "";
    }
    System.Text.StringBuilder description = new System.Text.StringBuilder();
    for (int i = 0; i < lines.Length; i++)
    {
        if (lines[i].Trim() != "Description")
        {
            continue;
        }
        i++;
        // Bounded by the array length so a page without the end marker cannot overrun.
        while (i < lines.Length && lines[i].Trim() != "More Product Details")
        {
            description.Append(lines[i].Trim());
            i++;
        }
    }
    return description.ToString();
}

// Parses a price after skipping a one- or two-character prefix (e.g. a currency symbol); returns 0.0 when neither form parses.
private static double ParseListPrice(string text)
{
    double price;
    if (text.Length >= 1 && double.TryParse(text.Substring(1), out price))
    {
        return price;
    }
    if (text.Length >= 2 && double.TryParse(text.Substring(2), out price))
    {
        return price;
    }
    return 0.0;
}