我想在运行时从网页中提取文本,并在 ASP.NET 中通过 C# 使用 Alchemy API,但我不知道如何在 C# 中调用此 API,也不清楚文本提取接口需要哪些参数。如有必要,我也可以尝试用正则表达式从网页中提取文本,但那样无法干净地去除 HTML 标签。
/// <summary>
/// Load event handler for the form. The body is empty, so no work is
/// performed when the form loads.
/// </summary>
/// <param name="sender">Event source; not used.</param>
/// <param name="e">Event data; not used.</param>
private void Form3_Load(object sender, EventArgs e)
{
}
/// <summary>
/// Runs a Google search for <paramref name="searchTerm"/>, downloads the
/// result page and hands its HTML to <c>fillgoogle</c>.
/// </summary>
/// <param name="url">
/// Not used by this method; kept so existing callers keep compiling.
/// </param>
/// <param name="searchTerm">Search text; it is URL-encoded before being placed in the query string.</param>
void GetPosition(Uri url, string searchTerm)
{
    string raw = "http://www.google.co.in/search?num=39&q={0}&btnG=Search";
    string search = string.Format(raw, HttpUtility.UrlEncode(searchTerm));
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(search);

    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        // Decode with the charset the response declares. The previous
        // hard-coded ASCII decoding replaced every non-ASCII character in
        // the page with '?'.
        Encoding encoding = Encoding.UTF8;
        string charset = response.CharacterSet;
        if (!string.IsNullOrEmpty(charset))
        {
            try
            {
                encoding = Encoding.GetEncoding(charset);
            }
            catch (ArgumentException)
            {
                // Unknown charset name: keep the UTF-8 fallback.
            }
            catch (NotSupportedException)
            {
                // Charset not available on this platform: keep the UTF-8 fallback.
            }
        }

        using (StreamReader reader = new StreamReader(response.GetResponseStream(), encoding))
        {
            string html = reader.ReadToEnd();
            fillgoogle(html);
        }
    }
}
/// <summary>
/// Clears listBox1 and refills it with link strings cut out of a search
/// result page. At most the first ten result chunks are examined; chunks
/// that do not have the expected shape are skipped instead of throwing.
/// </summary>
/// <param name="html">Raw HTML of the result page; null or empty leaves listBox1 empty.</param>
void fillgoogle(string html)
{
    listBox1.Items.Clear();
    if (string.IsNullOrEmpty(html))
    {
        return;
    }

    string pattern = "<h3 class=\"r\"><a href=";

    // The marker is a plain literal, so an ordinal string split does the
    // same job as the former Regex.Split without regex parsing.
    string[] Arr = html.Split(new string[] { pattern }, StringSplitOptions.None);

    // Element 0 is the text before the first marker, so start at 1.
    // Only chunks 1..10 are considered.
    for (int x = 1; x < Arr.Length && x <= 10; x++)
    {
        string chunk = Arr[x];

        // The cut point is the first "/" at or after index 38.
        // NOTE(review): 38 is a magic offset, presumably chosen to skip past
        // the redirect prefix and the "//" of the target URL - confirm.
        if (chunk.Length <= 38)
        {
            continue; // too short to contain a link in the expected layout
        }

        int end = chunk.IndexOf("/", 38, StringComparison.Ordinal);
        if (end < 0)
        {
            continue; // no terminating slash; previously Substring(0, -1) threw here
        }

        string link = chunk.Substring(0, end).Replace('"', ' ').Trim();

        // The first seven characters are dropped.
        // NOTE(review): presumably the "/url?q=" redirect prefix - confirm.
        if (link.Length < 7)
        {
            continue;
        }

        // str1 is declared outside this method; it keeps receiving the
        // last extracted link as before.
        str1 = link.Remove(0, 7);
        listBox1.Items.Add(str1);
    }
}
/// <summary>
/// Rebuilds listBox2 from listBox1: every entry that contains an ampersand
/// is copied, truncated just before its first ampersand. Entries without
/// an ampersand are not copied.
/// </summary>
void finalList()
{
    listBox2.Items.Clear();

    foreach (object entry in listBox1.Items)
    {
        string candidate = entry.ToString();
        int ampersandAt = candidate.IndexOf("&");
        if (ampersandAt == -1)
        {
            continue;
        }

        listBox2.Items.Add(candidate.Substring(0, ampersandAt));
    }
}
/// <summary>
/// Truncates <paramref name="url"/> at the first "/" found after the "//"
/// separator, i.e. keeps everything before the path.
/// </summary>
/// <param name="url">The URL text to shorten.</param>
/// <returns>
/// The part of <paramref name="url"/> before the first path slash; the whole
/// input when no such slash exists; an empty string for null or empty input.
/// </returns>
string prepare(string url)
{
    if (string.IsNullOrEmpty(url))
    {
        return string.Empty;
    }

    // Search starts three characters after the "//" position (or at index 2
    // when there is no "//", because IndexOf then yields -1).
    int separatorAt = url.IndexOf("//", StringComparison.Ordinal);
    int searchFrom = separatorAt + 3;
    if (searchFrom > url.Length)
    {
        // Nothing follows the separator; previously IndexOf threw here.
        return url;
    }

    int pathStart = url.IndexOf("/", searchFrom, StringComparison.Ordinal);

    // No slash after the host (e.g. "http://example.com"): previously
    // Substring(0, -1) threw; the input is already path-free.
    return pathStart < 0 ? url : url.Substring(0, pathStart);
}
/// <summary>
/// Finds the 1-based rank of the first search result whose link contains
/// the host of <paramref name="url"/>.
/// </summary>
/// <param name="html">Raw HTML of the result page.</param>
/// <param name="url">URL whose host is looked for inside each result link.</param>
/// <returns>The 1-based position of the first matching result, or 0 when none matches.</returns>
private static int FindPosition(
string html, Uri url)
{
    if (string.IsNullOrEmpty(html) || url == null)
    {
        return 0;
    }

    // Group 1 is the result-heading marker (class attribute with or without
    // quotes); group 2 captures the href value up to its closing quote.
    // The previous pattern's second group was "\*", which only matches a
    // literal asterisk, so no real link was ever found.
    string lookup = "(<h3 class=\"?r\"?><a href=\")([^\"]*)";
    MatchCollection matches = Regex.Matches(html, lookup);
    for (int i = 0; i < matches.Count; i++)
    {
        string match = matches[i].Groups[2].Value;

        // Host names are case-insensitive, so compare without regard to case.
        if (match.IndexOf(url.Host, StringComparison.OrdinalIgnoreCase) >= 0)
        {
            return i + 1;
        }
    }

    return 0;
}
/// <summary>
/// Runs the search for the text in textBox1, then loads the first link
/// from listBox2 and shows its body text in richTextBox1.
/// </summary>
/// <param name="sender">Event source; not used.</param>
/// <param name="e">Event data; not used.</param>
private void button1_Click(object sender, EventArgs e)
{
    richTextBox1.Text = "";

    Uri url = new Uri("http://www.godaddy.com");
    GetPosition(url, textBox1.Text);
    finalList();

    // The search may yield no usable links; indexing Items[0] on an empty
    // list would throw, so there is nothing to load or display.
    if (listBox2.Items.Count == 0)
    {
        return;
    }

    string firstLink = listBox2.Items[0].ToString();

    webPage page = new webPage();
    page.URL = firstLink;
    page.Load(); // Load the text from the specified URL

    label3.Visible = true;
    linkLabel1.Visible = true;
    label3.Text = firstLink;

    // Display the BODY TEXT on the screen
    richTextBox1.Text += Environment.NewLine + page.Body;
}
/// <summary>
/// Downloads a web page and exposes its title, filtered links, plain body
/// text and first paragraph. Set <see cref="URL"/>, call <see cref="Load"/>,
/// then read the properties.
/// </summary>
public class webPage
{
    /// <summary>Address of the page to download; must be set before calling <see cref="Load"/>.</summary>
    public String URL;

    private String sTitle;
    private String sBody;
    private String sParagraph;
    private ArrayList aList;

    /// <summary>Text of the page's title element, as extracted by the last <see cref="Load"/>.</summary>
    public String Title
    {
        get
        {
            return sTitle;
        }
    }

    /// <summary>Links collected by the last <see cref="Load"/> (strings).</summary>
    public ArrayList LinksArray
    {
        get
        {
            return aList;
        }
    }

    /// <summary>Body text with markup removed, as produced by the last <see cref="Load"/>.</summary>
    public String Body
    {
        get
        {
            return sBody;
        }
    }

    /// <summary>Inner content of the first paragraph element, or an empty string when none was found.</summary>
    public String Paragraph
    {
        get
        {
            return sParagraph;
        }
    }

    /// <summary>
    /// Downloads <see cref="URL"/> and fills the title, links, body and
    /// paragraph values. Any exception is reported in a message box rather
    /// than propagated.
    /// </summary>
    public void Load()
    {
        try
        {
            WebRequest objRequest = WebRequest.Create(this.URL);
            string strContent;

            // Dispose the response and reader so the connection is released
            // even when reading fails.
            using (WebResponse objResponse = objRequest.GetResponse())
            using (StreamReader oSR = new StreamReader(objResponse.GetResponseStream()))
            {
                strContent = oSR.ReadToEnd();
            }

            this.sTitle = getTitle(strContent);
            this.aList = fetchLinks(strContent, URL);
            this.sBody = fetchText(strContent);
            this.sParagraph = GetFirstParagraph(strContent);
        }
        catch (Exception e)
        {
            MessageBox.Show(e.ToString());
        }
    }

    /// <summary>Returns the text between the title tags, or an empty string when there is no match.</summary>
    private String getTitle(String sHTMLContent)
    {
        //Retrieve the title from the HTML code
        return Regex.Match(sHTMLContent, "<title>(?<title>[^<]+)</title>", RegexOptions.IgnoreCase).Groups["title"].ToString();
    }

    /// <summary>
    /// Collects every href value in the page, resolves it with
    /// <c>processURL</c>, and keeps those that contain <paramref name="sURL"/>
    /// and end with a known page extension.
    /// </summary>
    private ArrayList fetchLinks(String sHTMLContent, String sURL)
    {
        //Find all the links in the HTML code and put them
        //into an array
        ArrayList aMatch = new ArrayList();
        Match mMatch = Regex.Match(sHTMLContent, "href\\s*=\\s*(?:\"(?<1>[^\"]*)\"|(?<1>\\S+))", RegexOptions.IgnoreCase);
        while (mMatch.Success)
        {
            String sMatch = processURL(mMatch.Groups[1].ToString(), sURL);

            //Currently, this code only lists INTERNAL URLs. If you would
            //like to include EXTERNAL URLs as well, comment out the fol-
            //lowing IF statement EXCEPT the "aMatch.Add(sMatch);" line
            if (sMatch.IndexOf(sURL, StringComparison.Ordinal) >= 0 && checkFormat(sMatch))
            {
                aMatch.Add(sMatch);
            }

            mMatch = mMatch.NextMatch();
        }

        return aMatch;
    }

    /// <summary>Returns the trimmed content of the first single-line paragraph element, or an empty string.</summary>
    static string GetFirstParagraph(string s)
    {
        Match m = Regex.Match(s, @"<p>\s*(.+?)\s*</p>");
        return m.Success ? m.Groups[1].Value : "";
    }

    /// <summary>
    /// Reduces a page to plain text: keeps only the body element, removes
    /// script blocks, tags and a fixed set of entities, and collapses whitespace.
    /// </summary>
    private String fetchText(String s)
    {
        //Filter out HTML and JavaScript from the page, leaving only body text
        s = Convert.ToString(Regex.Match(s, @"<body.+?</body>", RegexOptions.Singleline | RegexOptions.IgnoreCase)); //strip everything but <BODY>
        s = Regex.Replace(s, "<script[^>]*?>.*?</script>", "", RegexOptions.Singleline | RegexOptions.IgnoreCase); //strip JavaScript
        s = Regex.Replace(s, "<[^>]*>", ""); //strip HTML tags

        // The lt and gt alternatives were previously fused ("&lt;&gt;" only),
        // so a lone "&lt;" or "&gt;" was never removed; they are now separate.
        s = Regex.Replace(s, "&(copy|#169);|&(quot|#34);|&(amp|#38);|&(lt|#60);|&(gt|#62);|&(nbsp|#160);|&(iexcl|#161);|&(cent|#162);|&(pound|#163);|·", " "); //strip symbols
        s = s.Replace("\t", " "); //strip tabs
        s = Regex.Replace(s, "([\r\n])+", " "); //strip carriage returns
        s = Regex.Replace(s, "\\s\\s+", " "); //strip white space (must be last)
        return s.Trim();
    }

    /// <summary>
    /// Turns a link found in the page into an absolute URL. Links that
    /// already contain an http or https scheme are returned unchanged; all
    /// others are appended to the scheme-and-host root of <paramref name="sURL"/>.
    /// </summary>
    private String processURL(String sInput, String sURL)
    {
        // Already absolute. https links were previously treated as relative
        // and glued onto the host.
        if (sInput.IndexOf("http://", StringComparison.OrdinalIgnoreCase) >= 0
            || sInput.IndexOf("https://", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            return sInput;
        }

        // "scheme://host/..." splits into [scheme:, "", host, ...].
        string[] segments = sURL.Split('/');
        if (segments.Length < 3)
        {
            // No host segment to build a root from; leave the link as found.
            return sInput;
        }

        // Keep the page's own scheme so resolved links still contain sURL
        // for https pages (the root used to be forced to http).
        string scheme = sURL.StartsWith("https://", StringComparison.OrdinalIgnoreCase) ? "https://" : "http://";
        string root = scheme + segments[2];

        // The root never ends with a slash, so only the link decides
        // whether a separator has to be inserted.
        return sInput.StartsWith("/", StringComparison.Ordinal)
            ? root + sInput
            : root + "/" + sInput;
    }

    /// <summary>Returns true when the URL, ignoring any query string, ends with one of the listed page extensions.</summary>
    private bool checkFormat(String sURL)
    {
        //List only pages ending with valid extensions
        String[] validExt = { ".html", ".php", ".asp", ".htm", ".jsp", ".shtml", ".php3", ".aspx", ".pl", ".cfm" };
        sURL = sURL.Split('?')[0];
        foreach (String ext in validExt)
        {
            // EndsWith copes with URLs shorter than the extension, where the
            // former Substring call threw.
            if (sURL.EndsWith(ext, StringComparison.OrdinalIgnoreCase))
            {
                return true;
            }
        }

        return false;
    }
}
/// <summary>
/// Handles a click on linkLabel1 by passing the text currently shown in
/// label3 to <c>Process.Start</c>.
/// </summary>
/// <param name="sender">Event source; not used.</param>
/// <param name="e">Event data; not used.</param>
private void linkLabel1_LinkClicked(object sender, LinkLabelLinkClickedEventArgs e)
{
    string target = label3.Text;
    System.Diagnostics.Process.Start(target);
}
}
}
答案 0 :(得分:0)
我没有看到在您的示例中调用Alchemy API的任何尝试,但这是您需要知道的: