我试图读取一个文本文件(附在此消息中)并打印一个 HashMap,期望的输出是 (url,0)、(url,1)……但是我只能从文本中获取 3 个 URL,无法得到所有 URL 及其对应的索引。有人可以帮助我吗?
-----------------Code------------------------------------------------
import java.util.List;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URL;
import java.util.Scanner;
import java.util.regex.Pattern;
import java.util.ArrayList;
import java.util.HashMap;
public class ArticleIndexer{
public static void main(String[] args) throws FileNotFoundException {
File file = new File("src\\article2content.txt");
Scanner scan = new Scanner(file);
scanAndPrint(scan,"guardian");
//scan.close();
}
private static boolean matchServer(String url ,String str)
{
if (url.toLowerCase().contains(str.toLowerCase()) && (url != null)&&(str!=null))
{
return true;
}
else{
return false;
}
}
public static void scanAndPrint(Scanner in) throws FileNotFoundException {
String page = in.useDelimiter(Pattern.compile("\\A")).next();
//System.out.println(page.length());
int beg = -1;
for (int i = 0; i < page.length(); i++)
{
if (i + "<a href=".length() <= page.length() || i + "<a Href=".length() <= page.length() || i + "<a hRef=".length() <= page.length() || i + "<a hrEf=".length() <= page.length() || i + "<a hreF=".length() <= page.length() || i + "<A href=".length() <= page.length())
{
String tag = page.substring(i, i +8);
if (tag.equals("<a href=") || tag.equals("<a Href=") || tag.equals("<a hRef=") || tag.equals("<a hrEf=") || tag.equals("<a hreF=") || tag.equals("<A href="))
beg = i;
}
if (i + "\">".length() <= page.length())
{
String tag = page.substring(i, i + 2);
if ((tag.equals("\">") || tag.equals("\">")) && beg != -1)
{
String tt= page.substring(beg + 9, i);
String[] array=tt.split("\"");
String Url= array[0];
Url=MyURL.parseURL(Url);
System.out.println(Url);
beg = -1;
}
}
}
in.close();
}
public static void scanAndPrint(Scanner in, String str) throws FileNotFoundException {
//HashMap<String , Integer>= new HashMap<>
HashMap<String, Integer> hmap = new HashMap<String, Integer>();
// String page = in.useDelimiter(Pattern.compile("\\A")).next();
// //System.out.println(page.length());
// int beg = -1;
// for (int i = 0; i < page.length(); i++)
// {
// if (i + "<a href=".length() <= page.length() || i + "<a Href=".length() <= page.length() || i + "<a hRef=".length() <= page.length() || i + "<a hrEf=".length() <= page.length() || i + "<a hreF=".length() <= page.length() || i + "<A href=".length() <= page.length())
// {
// String tag = page.substring(i, i +8);
// if (tag.equals("<a href=") || tag.equals("<a Href=") || tag.equals("<a hRef=") || tag.equals("<a hrEf=") || tag.equals("<a hreF=") || tag.equals("<A href="))
// beg = i;
// }
// if (i + "\">".length() <= page.length())
// {
// String tag = page.substring(i, i + 2);
// if ((tag.equals("\">") || tag.equals("\">")) && beg != -1)
// {
// String tt= page.substring(beg + 9, i);
//
// String[] array=tt.split("\"");
// String Url= array[0];
// Url=MyURL.parseURL(Url);
//
// System.out.println(Url);
// if(matchServer(Url,str)==true)
String page = in.useDelimiter(Pattern.compile("\\A")).next();
//System.out.println(page.length());
int beg = -1;
for (int i = 0; i < page.length(); i++)
{
if (i + "<a href=".length() <= page.length() || i + "<a Href=".length() <= page.length() || i + "<a hRef=".length() <= page.length() || i + "<a hrEf=".length() <= page.length() || i + "<a hreF=".length() <= page.length() || i + "<A href=".length() <= page.length())
{
String tag = page.substring(i, i +8);
if (tag.equals("<a href=") || tag.equals("<a Href=") || tag.equals("<a hRef=") || tag.equals("<a hrEf=") || tag.equals("<a hreF=") || tag.equals("<A href="))
beg = i;
}
if (i + "\">".length() <= page.length())
{
String tag = page.substring(i, i + 2);
if ((tag.equals("\">") || tag.equals("\">")) && beg != -1)
{
String tt= page.substring(beg + 9, i);
String[] array=tt.split("\"");
String Url= array[0];
Url=MyURL.parseURL(Url);
if(matchServer(Url,str)==true)
{
//System.out.println(Url);
for(int j = 0; j < Url.length(); j++)
{
hmap.put(Url, j);
for (String s : hmap.keySet())
System.out.println( s+ " " + hmap.get(s));
}
}
beg = -1;
}
}
}
}
}
Thanks.
答案 0 :(得分:0)
也许您可以尝试使用 Jsoup 库来解析 HTML 并提取其中的链接?
下面的示例也许有帮助。
package org.jsoup.examples;
import org.jsoup.Jsoup;
import org.jsoup.helper.Validate;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
/**
* Example program to list links from a URL.
*/
public class ListLinks {
    /**
     * Fetches the document at the URL given as the single command-line
     * argument and prints three sections: elements carrying a src attribute,
     * link elements carrying an href, and anchors carrying an href.
     *
     * @param args exactly one element: the URL to fetch
     * @throws IOException if fetching the document fails
     */
    public static void main(String[] args) throws IOException {
        Validate.isTrue(args.length == 1, "usage: supply url to fetch");
        final String target = args[0];
        print("Fetching %s...", target);

        final Document page = Jsoup.connect(target).get();
        final Elements anchors = page.select("a[href]");
        final Elements sources = page.select("[src]");
        final Elements linkTags = page.select("link[href]");

        print("\nMedia: (%d)", sources.size());
        for (Element item : sources) {
            if (!item.tagName().equals("img")) {
                print(" * %s: <%s>", item.tagName(), item.attr("abs:src"));
            } else {
                // Images additionally report their dimensions and a shortened alt text.
                print(" * %s: <%s> %sx%s (%s)",
                        item.tagName(),
                        item.attr("abs:src"),
                        item.attr("width"),
                        item.attr("height"),
                        trim(item.attr("alt"), 20));
            }
        }

        print("\nImports: (%d)", linkTags.size());
        for (Element item : linkTags) {
            print(" * %s <%s> (%s)", item.tagName(), item.attr("abs:href"), item.attr("rel"));
        }

        print("\nLinks: (%d)", anchors.size());
        for (Element item : anchors) {
            print(" * a: <%s> (%s)", item.attr("abs:href"), trim(item.text(), 35));
        }
    }

    /** Formats {@code msg} with {@code args} and writes the result as one line to standard output. */
    private static void print(String msg, Object... args) {
        final String line = String.format(msg, args);
        System.out.println(line);
    }

    /**
     * Shortens {@code s} to at most {@code width} characters; a shortened
     * string keeps its first {@code width - 1} characters followed by ".".
     */
    private static String trim(String s, int width) {
        return s.length() > width ? s.substring(0, width - 1) + "." : s;
    }
}