Question

我想在运行时从网页中提取文本，并在ASP.NET中通过C#调用Alchemy API来实现，但我不知道如何在C#中使用这个API。我正在尝试弄清楚文本提取器需要哪些参数。如果需要，我也可以尝试用正则表达式从网页中提取文本，但那样无法干净地去除HTML标签。

    // Handler with the standard WinForms event signature; the body is empty, so it
    // currently performs no work. Presumably wired to the form's Load event in the
    // designer file -- confirm there before removing it.
    private void Form3_Load(object sender, EventArgs e)
    {

    }
    /// <summary>
    /// Runs a Google web search for <paramref name="searchTerm"/>, downloads the
    /// result page and passes its raw HTML to <c>fillgoogle</c>.
    /// </summary>
    /// <param name="url">Not used by this method; kept so existing callers still compile.</param>
    /// <param name="searchTerm">Text to search for; it is URL-encoded before being placed in the query string.</param>
    void GetPosition(Uri url, string searchTerm)
    {
        string raw = "http://www.google.co.in/search?num=39&q={0}&btnG=Search";
        string search = string.Format(raw, HttpUtility.UrlEncode(searchTerm));
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(search);

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            // Decode as UTF-8 instead of ASCII: an ASCII decoder replaces every
            // non-ASCII byte with '?', which corrupts any non-English text.
            using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.UTF8))
            {
                string html = reader.ReadToEnd();
                fillgoogle(html);
            }
        }
    }
    //New Fill Google
    /// <summary>
    /// Clears <c>listBox1</c> and refills it with link targets cut out of a Google
    /// result page. At most the first ten result segments are examined; segments
    /// that are too short to contain a link are skipped instead of throwing.
    /// </summary>
    /// <param name="html">Raw HTML of the result page; null or empty leaves the list empty.</param>
    void fillgoogle(string html)
    {
        // Markup that precedes every result link; the page is split on it.
        string pattern = "<h3 class=\"r\"><a href=";
        // Number of result segments examined.
        const int MaxResults = 10;
        // Offset from which the terminating '/' is searched. The value comes from
        // the original code; its derivation is not documented -- TODO confirm.
        const int SlashSearchStart = 38;
        // Characters dropped from the front of each link; presumably the
        // "/url?q=" redirect prefix (7 characters) -- TODO confirm.
        const int PrefixLength = 7;

        listBox1.Items.Clear();
        if (string.IsNullOrEmpty(html))
        {
            return;
        }

        string[] Arr = Regex.Split(html, pattern);
        int last = Math.Min(Arr.Length - 1, MaxResults);

        // Segment 0 is the text before the first marker, so start at 1.
        for (int x = 1; x <= last; x++)
        {
            string segment = Arr[x];
            if (segment.Length <= SlashSearchStart)
            {
                continue; // too short to hold a link
            }

            int end = segment.IndexOf('/', SlashSearchStart);
            if (end < 0)
            {
                continue; // no terminating slash found
            }

            // Turn the opening quote into a space and trim it away.
            string candidate = segment.Substring(0, end).Replace('"', ' ').Trim();
            if (candidate.Length < PrefixLength)
            {
                continue; // nothing left after the prefix would be removed
            }

            // str1 is declared outside this method; it keeps the last link added.
            str1 = candidate.Remove(0, PrefixLength);
            listBox1.Items.Add(str1);
        }
    }
    /// <summary>
    /// Rebuilds <c>listBox2</c> from <c>listBox1</c>: each entry that contains an
    /// ampersand is cut just before the first one and the leading part is added;
    /// entries without an ampersand are left out.
    /// </summary>
    void finalList()
    {
        listBox2.Items.Clear();

        int index = 0;
        while (index < listBox1.Items.Count)
        {
            string entry = listBox1.Items[index].ToString();
            index++;

            int cut = entry.IndexOf("&");
            if (cut < 0)
            {
                continue;
            }

            listBox2.Items.Add(entry.Substring(0, cut));
        }
    }
    /// <summary>
    /// Cuts a URL down to the part before its first path slash, e.g.
    /// "http://host/a/b" becomes "http://host".
    /// </summary>
    /// <param name="url">URL to shorten.</param>
    /// <returns>
    /// The text before the first '/' that follows the "//" separator, or
    /// <paramref name="url"/> unchanged when it is null, empty or has no such slash.
    /// </returns>
    string prepare(string url)
    {
        if (string.IsNullOrEmpty(url))
        {
            return url;
        }

        int separator = url.IndexOf("//", StringComparison.Ordinal);

        // Same offset as before: three characters past the "//" position (which is
        // -1 when there is no "//", so the search then starts at index 2).
        int searchFrom = separator + 3;
        if (searchFrom >= url.Length)
        {
            return url;
        }

        int pathStart = url.IndexOf('/', searchFrom);

        // No path component: the URL is already just scheme and host.
        return pathStart < 0 ? url : url.Substring(0, pathStart);
    }

    /// <summary>
    /// Finds the 1-based rank of the first search result whose link contains the
    /// host of <paramref name="url"/>.
    /// </summary>
    /// <param name="html">Raw HTML of a Google result page.</param>
    /// <param name="url">Address whose host is looked for in the result links.</param>
    /// <returns>The 1-based position of the first matching result, or 0 when none matches.</returns>
    private static int FindPosition(
        string html, Uri url)
    {
        if (string.IsNullOrEmpty(html))
        {
            return 0;
        }

        // Same result marker that fillgoogle splits on (quotes around the class
        // value are optional); group 1 captures the href up to its closing quote.
        // The previous pattern captured a literal '*' and so could never match a host.
        string lookup = "<h3 class=\"?r\"?><a href=\"([^\"]*)";
        MatchCollection matches = Regex.Matches(html, lookup);

        for (int i = 0; i < matches.Count; i++)
        {
            string href = matches[i].Groups[1].Value;

            // Host names are case-insensitive, so compare without regard to case.
            if (href.IndexOf(url.Host, StringComparison.OrdinalIgnoreCase) >= 0)
            {
                return i + 1;
            }
        }

        return 0;
    }

    /// <summary>
    /// Click handler: searches for the text in <c>textBox1</c>, loads the first
    /// link left in <c>listBox2</c> and shows that page's body text in
    /// <c>richTextBox1</c>.
    /// </summary>
    private void button1_Click(object sender, EventArgs e)
    {
        richTextBox1.Text = "";
        Uri url = new Uri("http://www.godaddy.com");
        GetPosition(url, textBox1.Text);
        finalList();

        // The search may produce no usable link; indexing Items[0] would then throw.
        if (listBox2.Items.Count == 0)
        {
            return;
        }

        string firstLink = listBox2.Items[0].ToString();

        webPage page = new webPage();
        page.URL = firstLink;
        page.Load();  // downloads the page and extracts its text

        label3.Visible = true;
        linkLabel1.Visible = true;
        label3.Text = firstLink;

        // Show the page's body text.
        richTextBox1.Text += Environment.NewLine + page.Body;
    }
    /// <summary>
    /// Downloads a single web page and exposes its title, links, plain body text
    /// and first paragraph. Set <see cref="URL"/>, call <see cref="Load"/>, then
    /// read the properties.
    /// </summary>
    public class webPage
    {
        /// <summary>Address of the page to download; must be set before calling <see cref="Load"/>.</summary>
        public String URL;
        private String sTitle;
        private String sBody;
        private String sParagraph;
        private ArrayList aList;

        /// <summary>Text of the page's title element; null until <see cref="Load"/> has succeeded.</summary>
        public String Title
        {
            get
            {
                return sTitle;
            }
        }

        /// <summary>Links collected by <see cref="Load"/>; null until it has succeeded.</summary>
        public ArrayList LinksArray
        {
            get
            {
                return aList;
            }
        }

        /// <summary>Body text with markup removed; null until <see cref="Load"/> has succeeded.</summary>
        public String Body
        {
            get
            {
                return sBody;
            }
        }

        /// <summary>Content of the first paragraph element, or "" when none was found; null until <see cref="Load"/> has succeeded.</summary>
        public String Paragraph
        {
            get
            {
                return sParagraph;
            }
        }

        /// <summary>
        /// Downloads <see cref="URL"/> and fills the title, links, body and paragraph.
        /// Any exception is shown in a message box instead of being propagated.
        /// </summary>
        public void Load()
        {
            try
            {
                string strContent;
                WebRequest objRequest = WebRequest.Create(this.URL);

                // Dispose the response and reader so the connection is released
                // even when reading fails.
                using (WebResponse objResponse = objRequest.GetResponse())
                using (StreamReader oSR = new StreamReader(objResponse.GetResponseStream()))
                {
                    strContent = oSR.ReadToEnd();
                }

                this.sTitle = getTitle(strContent);
                this.aList = fetchLinks(strContent, URL);
                this.sBody = fetchText(strContent);
                this.sParagraph = GetFirstParagraph(strContent);
            }
            catch (Exception e)
            {
                MessageBox.Show(e.ToString());
            }
        }

        // Returns the text inside the title element, or "" when there is none.
        private String getTitle(String sHTMLContent)
        {
            return Regex.Match(sHTMLContent, "<title>(?<title>[^<]+)</title>", RegexOptions.IgnoreCase).Groups["title"].ToString();
        }

        // Collects every href value in the page, resolves it with processURL and
        // keeps those that contain the page's own URL and end in a known page
        // extension (see checkFormat).
        private ArrayList fetchLinks(String sHTMLContent, String sURL)
        {
            ArrayList aMatch = new ArrayList();

            Match mMatch = Regex.Match(sHTMLContent, "href\\s*=\\s*(?:\"(?<1>[^\"]*)\"|(?<1>\\S+))", RegexOptions.IgnoreCase);

            while (mMatch.Success)
            {
                String sMatch = processURL(mMatch.Groups[1].ToString(), sURL);

                // Only INTERNAL links are listed. To include external links as
                // well, drop the condition and always add sMatch.
                if (sMatch.IndexOf(sURL) >= 0 && checkFormat(sMatch))
                {
                    aMatch.Add(sMatch);
                }

                mMatch = mMatch.NextMatch();
            }

            return aMatch;
        }

        // Returns the content of the first "<p>...</p>" pair that sits on one line,
        // or "" when there is none.
        static string GetFirstParagraph(string s)
        {
            Match m = Regex.Match(s, @"<p>\s*(.+?)\s*</p>");
            if (m.Success)
            {
                return m.Groups[1].Value;
            }
            else
            {
                return "";
            }
        }

        // Reduces a page to its body text: keeps only the body element, removes
        // script blocks, tags and a fixed set of entities, then collapses whitespace.
        private String fetchText(String s)
        {
            s = Convert.ToString(Regex.Match(s, @"<body.+?</body>", RegexOptions.Singleline | RegexOptions.IgnoreCase));    //strip everything but <BODY>
            s = Regex.Replace(s, "<script[^>]*?>.*?</script>", "", RegexOptions.Singleline | RegexOptions.IgnoreCase);      //strip JavaScript
            s = Regex.Replace(s, "<[^>]*>", "");                                                    //strip HTML tags
            // The "|" between the &lt; and &gt; alternatives was missing, so the two
            // entities were only removed when they appeared back to back.
            s = Regex.Replace(s, "&(copy|#169);|&(quot|#34);|&(amp|#38);|&(lt|#60);|&(gt|#62);|&(nbsp|#160);|&(iexcl|#161);|&(cent|#162);|&(pound|#163);|&middot;", " ");    //strip symbols
            s = s.Replace("\t", " ");                                                               //strip tabs
            s = Regex.Replace(s, "([\r\n])+", " ");                                                 //strip carriage returns
            s = Regex.Replace(s, "\\s\\s+", " ");                                                   //strip white space (must be last)
            return s.Trim();
        }

        // Turns a link found in the page into an absolute URL. Links that already
        // contain "http://" or "https://" are returned unchanged; anything else is
        // appended to the scheme and host of the page URL.
        private String processURL(String sInput, String sURL)
        {
            bool isAbsolute = sInput.IndexOf("http://", StringComparison.Ordinal) >= 0
                || sInput.IndexOf("https://", StringComparison.Ordinal) >= 0;
            if (isAbsolute)
            {
                return sInput;
            }

            // "scheme://host/path" splits into { "scheme:", "", "host", ... }.
            String[] parts = sURL.Split('/');
            if (parts.Length < 3)
            {
                return sInput; // page URL has no host part to resolve against
            }

            // Keep the page's own scheme so links found on an https page still
            // contain the page URL and pass the internal-link filter.
            sURL = parts[0] + "//" + parts[2];

            bool inputHasSlash = sInput.StartsWith("/", StringComparison.Ordinal);
            bool baseHasSlash = sURL.EndsWith("/", StringComparison.Ordinal);

            if (!inputHasSlash && !baseHasSlash)
            {
                return sURL + "/" + sInput;
            }

            if (inputHasSlash && baseHasSlash)
            {
                // Avoid a doubled slash at the join.
                return sURL.Substring(0, sURL.Length - 1) + sInput;
            }

            return sURL + sInput;
        }

        // True when the URL, ignoring its query string, ends with one of the
        // listed page extensions (case-insensitive).
        private bool checkFormat(String sURL)
        {
            String[] validExt = { ".html", ".php", ".asp", ".htm", ".jsp", ".shtml", ".php3", ".aspx", ".pl", ".cfm" };
            sURL = Convert.ToString(sURL.Split('?').GetValue(0));

            foreach (String ext in validExt)
            {
                // EndsWith copes with URLs shorter than the extension, where the
                // former Substring call threw.
                if (sURL.EndsWith(ext, StringComparison.OrdinalIgnoreCase)) { return true; }
            }

            return false;
        }
    }

    /// <summary>
    /// Opens the address shown in <c>label3</c> with the system's default handler.
    /// </summary>
    private void linkLabel1_LinkClicked(object sender, LinkLabelLinkClickedEventArgs e)
    {
        // The label text comes from scraped search results, so only hand it to
        // the shell when it is a well-formed http/https address; anything else
        // (empty text, file paths, other schemes) is ignored.
        Uri target;
        bool isWebLink = Uri.TryCreate(label3.Text, UriKind.Absolute, out target)
            && (target.Scheme == Uri.UriSchemeHttp || target.Scheme == Uri.UriSchemeHttps);
        if (!isWebLink)
        {
            return;
        }

        System.Diagnostics.Process.Start(target.AbsoluteUri);
    }
}
}

Answer 1

我没有看到在您的示例中调用Alchemy API的任何尝试，但这是您需要知道的：

Alchemy API默认使用返回XML响应的标准Web Service调用。不过，您可以指定所需的响应格式（JSON / RDF）。
以下是入门文档、Text Extraction（文本提取）及其要求，以及URLGetText端点（但请在文档网站上查找您需要的具体端点）。

在C#中使用Alchemy API进行文本提取

1 个答案: