C#:用 HTML Agility Pack 的 LoadHtml 加载来自 SQL Server 数据库的字符串

时间:2013-07-05 11:22:12

标签: c# html .net html-agility-pack

我有一个存满网页的数据库,想用 HTML Agility Pack 从这些网页中提取信息。我已经写好了一个函数,从富文本框加载文本时,它能取到我想要的信息。然而,当我从 SQL 数据库中的字符串加载 HTML 时,它无法正确取得节点中的全部文本,因此得不到我需要的网页上的全部信息。

/// <summary>
/// Extracts product fields from a product page's HTML into a fixed 17-slot string array.
/// </summary>
/// <param name="strWebpage">HTML of the product page; passed straight to <c>LoadHtml</c>.</param>
/// <param name="strURL">Source URL of the page; copied unchanged into slot 15.</param>
/// <param name="iID">Raw page id; stored as text in slot 16.</param>
/// <returns>
/// Slots: 0 category, 1 list price, 2 availability ("1"/"0"), 3 language, 4 arrangement,
/// 5 skill level, 6 publisher, 7 catalogue no., 8 description, 9 published date (yyyy-MM-dd),
/// 10 format, 11 pages, 12 ISBN, 13 title, 14 product image markup with the known img wrapper
/// stripped, 15 source URL, 16 page id. Slots 0 and 8 default to an empty string; any other
/// slot whose element or label is not found on the page is left null.
/// </returns>
private static string[] Data(string strWebpage,string strURL, int iID)
        {
            // Declare and load HTML Agility Pack
            HtmlAgilityPack.HtmlDocument document = new HtmlAgilityPack.HtmlDocument();
            document.LoadHtml(strWebpage);

            string[] strData = new string[17]; // Return string array
            strData[0] = "";                   // Product category start text
            strData[8] = "";                   // Description start text
            strData[15] = strURL;              // Page source URL
            strData[16] = iID.ToString();      // Raw page id

            // SelectSingleNode / SelectNodes return null when the XPath matches nothing,
            // so every result below is checked before it is dereferenced.
            HtmlAgilityPack.HtmlNode topInfoNode = document.DocumentNode.SelectSingleNode("//div[@class='product_info']"); // Top product information
            HtmlAgilityPack.HtmlNode bottomInfoNode = document.DocumentNode.SelectSingleNode("//table[@width='300px']"); // Bottom product information
            HtmlAgilityPack.HtmlNode titleNode = document.DocumentNode.SelectSingleNode("//h2[@class='name']"); // Product title
            HtmlAgilityPack.HtmlNode imageNode = document.DocumentNode.SelectSingleNode("//div[@class='product_image']"); // Product image
            HtmlAgilityPack.HtmlNode contentNode = document.DocumentNode.SelectSingleNode("//div[@class='contentwrapper']"); // Product description
            HtmlAgilityPack.HtmlNodeCollection categoryNodes = document.DocumentNode.SelectNodes("//div[@class='conttopright']//a[@class='uponelevel']"); // Product category

            // Collect label/value pairs from both information sections.
            // Pairs are kept as pairs (not tab-joined and re-split) so a value containing a tab is not truncated.
            List<KeyValuePair<string, string>> fields = new List<KeyValuePair<string, string>>();
            foreach (HtmlAgilityPack.HtmlNode infoNode in new[] { topInfoNode, bottomInfoNode })
            {
                if (infoNode == null)
                {
                    continue;
                }

                string[] bits = TextToNArray(infoNode.InnerText);

                // "i + 1 < Length" drops a trailing label that has no value instead of
                // indexing past the end of the array.
                for (int i = 0; i + 1 < bits.Length; i += 2)
                {
                    fields.Add(new KeyValuePair<string, string>(bits[i].Trim(), bits[i + 1].Trim()));
                }
            }

            if (titleNode != null)
            {
                strData[13] = titleNode.InnerText.Trim(); // Title
            }

            if (imageNode != null)
            {
                // NOTE(review): this only strips the <img> wrapper when the markup matches these
                // exact literals; markup stored in a different form will pass through unchanged.
                strData[14] = imageNode.InnerHtml.Replace("\\", "\\\\").Replace("<img id=\"ctl00_ContentPlaceHolder1_ProductImage\" src=\"", "").Replace("\" alt=\"Product Image\" style=\"border-width:0px;\">", "").Trim();
            }

            // Get product category
            if (categoryNodes != null)
            {
                List<string> categoryNames = new List<string>();
                foreach (HtmlAgilityPack.HtmlNode categoryNode in categoryNodes)
                {
                    categoryNames.Add(categoryNode.InnerText);
                }
                strData[0] = string.Join("-", categoryNames).Trim('-').Trim().Replace("Home-", "");
            }

            // Extract the description: every line between "Description" and "More Product Details"
            if (contentNode != null)
            {
                string[] contentLines = TextToNArray(contentNode.InnerText);
                for (int i = 0; i < contentLines.Length; i++)
                {
                    if (contentLines[i].Trim() == "Description")
                    {
                        i++;
                        // The bound check stops the scan at the end of the text when the
                        // closing "More Product Details" marker is absent.
                        while (i < contentLines.Length && contentLines[i].Trim() != "More Product Details")
                        {
                            strData[8] += contentLines[i].Trim(); // Add description as one line
                            i++;
                        }
                    }
                }
            }

            // Order additional information into the array
            foreach (KeyValuePair<string, string> field in fields)
            {
                string value = field.Value;

                // Invariant lower-casing keeps label matching independent of the machine's culture.
                switch (field.Key.ToLowerInvariant())
                {
                    case "list price*":
                    {
                        // Skip a one-character prefix first, then a two-character one; 0 if neither parses.
                        double price = 0.0;
                        bool parsed = value.Length > 1 && double.TryParse(value.Substring(1), out price);
                        if (!parsed && value.Length > 2)
                        {
                            parsed = double.TryParse(value.Substring(2), out price);
                        }
                        if (!parsed)
                        {
                            price = 0.0;
                        }
                        strData[1] = price.ToString();
                        break;
                    }
                    case "availability":
                        // Determine if the book is available
                        strData[2] = string.Equals(value, "available", StringComparison.OrdinalIgnoreCase) ? "1" : "0";
                        break;
                    case "language":
                        strData[3] = value;
                        break;
                    case "arrangement":
                        strData[4] = value;
                        break;
                    case "skill level":
                        strData[5] = value;
                        break;
                    case "publisher":
                        strData[6] = value;
                        break;
                    case "catalogue no.":
                        strData[7] = value;
                        break;
                    case "published on":
                    {
                        // Turn the date into the yyyy-MM-dd form the database expects;
                        // "0000-00-00" marks a date that could not be parsed.
                        DateTime publishedOn;
                        strData[9] = DateTime.TryParse(value, out publishedOn)
                            ? publishedOn.ToString("yyyy-MM-dd")
                            : "0000-00-00";
                        break;
                    }
                    case "format":
                        strData[10] = value;
                        break;
                    case "pages":
                        strData[11] = value;
                        break;
                    case "isbn":
                        strData[12] = value;
                        break;
                }
            }

            // Return data found
            return strData;
        }

0 个答案:

没有答案