My program is stuck in a never-ending loop, and I can't see how it falls into this trap or how to avoid it.
It's parsing Wikipedia data, and I think it's just going round and round one connected component.
Maybe I could store the pages I've already visited in a set, and skip any page that's already in that set?
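Something like this is what I have in mind; the visited set and the early return here are only a sketch of the fix, they are not in the project yet:

    // sketch only: remember which aliases have already been fetched
    static Set<String> visited = new HashSet<String>();

    public static void getQ( String platonic_key, String associated_alias ) throws Exception
    {
        // Set.add returns false when the element is already present, so the
        // second visit to the same alias bails out instead of recursing again
        if ( !visited.add( associated_alias ) )
        {
            return;
        }
        // ... the rest of getQ exactly as posted below ...
    }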
This is my project; it's small, just three short classes.
This is a link to the data it generates; I truncated it, otherwise it would go on forever.
This is a laughably small toy input that produces this mess.
It's the same project I was working on when I asked this question.
Here is the code in full.

The main class:
public static void main( String[] args ) throws Exception
{
    String name_list_file = "/home/matthias/Workbench/SUTD/nytimes_corpus/NYTimesCorpus/2005/01/02/test/people_test.txt";
    String single_name;

    try (
        // read in the original file, the list of names
        InputStream stream_for_name_list_file = new FileInputStream( name_list_file );
        InputStreamReader stream_reader = new InputStreamReader( stream_for_name_list_file, Charset.forName("UTF-8") );
        BufferedReader line_reader = new BufferedReader( stream_reader );
    )
    {
        while (( single_name = line_reader.readLine() ) != null)
        {
            // URL-encode the name so it is safe to put in the query string
            String associated_alias = URLEncoder.encode( single_name, "UTF-8" );
            String platonic_key = single_name;
            System.out.println("now processing: " + platonic_key);
            Wikidata_Q_Reader.getQ( platonic_key, associated_alias );
        }
    }

    // print the structure
    Wikidata_Q_Reader.print_data();
}
The Wikidata reader / Q-value fetcher:
static Map<String, HashSet<String>> q_valMap = new HashMap<String, HashSet<String>>();

public static void getQ( String platonic_key, String associated_alias ) throws Exception
{
    // fetch the corresponding Wikidata page
    String URL_czech = "https://www.wikidata.org/wiki/Special:ItemByTitle?site=en&page=" + associated_alias + "&submit=Search";
    URL wikidata_page = new URL( URL_czech );
    HttpURLConnection wiki_connection = (HttpURLConnection) wikidata_page.openConnection();
    InputStream wikiInputStream = null;

    try
    {
        // try to connect and use the input stream
        wiki_connection.connect();
        wikiInputStream = wiki_connection.getInputStream();
    }
    catch ( IOException e )
    {
        // that failed, so fall back to the error stream
        wikiInputStream = wiki_connection.getErrorStream();
    }

    BufferedReader wiki_data_pagecontent = new BufferedReader( new InputStreamReader( wikiInputStream ) );

    String line_by_line;
    while (( line_by_line = wiki_data_pagecontent.readLine() ) != null)
    {
        // if this turns out to be a disambiguation page, send it off to
        // collect all the possible senses in which the name can be used
        Pattern disambig_pattern = Pattern.compile("<div class=\"wikibase-entitytermsview-heading-description \">Wikipedia disambiguation page</div>");
        Matcher disambig_indicator = disambig_pattern.matcher( line_by_line );
        if ( disambig_indicator.matches() )
        {
            // off to get the different usages
            Wikipedia_Disambig_Fetcher.all_possibilities( platonic_key, associated_alias );
        }
        else
        {
            // otherwise pull the Q value off the page by matching
            Pattern q_page_pattern = Pattern.compile("<!-- wikibase-toolbar --><span class=\"wikibase-toolbar-container\"><span class=\"wikibase-toolbar-item " +
                    "wikibase-toolbar \">\\[<span class=\"wikibase-toolbar-item wikibase-toolbar-button wikibase-toolbar-button-edit\"><a " +
                    "href=\"/wiki/Special:SetSiteLink/(.*?)\">edit</a></span>\\]</span></span>");
            Matcher match_Q_component = q_page_pattern.matcher( line_by_line );
            if ( match_Q_component.matches() )
            {
                String Q = match_Q_component.group( 1 );
                // each Q goes into a set, since an entity can end up with
                // several Q values by way of disambiguation
                put_to_hash( platonic_key, Q );
            }
        }
    }
    wiki_data_pagecontent.close();

    // print what we have so far
    for ( Map.Entry<String, HashSet<String>> entry : q_valMap.entrySet() )
    {
        System.out.println( entry.getKey() + " : " + entry.getValue() );
    }
}
// add a Q value to the set in the hash map for the appropriate entity
public static HashSet<String> put_to_hash( String key, String value )
{
    HashSet<String> valSet;
    if ( q_valMap.containsKey( key ) )
    {
        valSet = q_valMap.get( key );
    }
    else
    {
        valSet = new HashSet<String>();
        q_valMap.put( key, valSet );
    }
    valSet.add( value );
    return valSet;
}
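(I suspect put_to_hash could also be collapsed to one line with Java 8's Map.computeIfAbsent, i.e. q_valMap.computeIfAbsent( key, k -> new HashSet<String>() ).add( value ), but I've kept the long form above.)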
// print the final data set
public static void print_data()
{
    System.out.println("THIS IS THE FINAL DATA SET!!!");
    for ( Map.Entry<String, HashSet<String>> entry : q_valMap.entrySet() )
    {
        System.out.println( entry.getKey() + " : " + entry.getValue() );
    }
}
Handling the disambiguation pages:
public static void all_possibilities( String platonic_key, String associated_alias ) throws Exception
{
    System.out.println("this is a disambig page");

    // since it's a disambiguation page, we can go straight to
    // its normal Wikipedia disambiguation page
    String URL_czech = "https://en.wikipedia.org/wiki/" + associated_alias;
    URL wikidata_page = new URL( URL_czech );
    HttpURLConnection wiki_connection = (HttpURLConnection) wikidata_page.openConnection();
    InputStream wikiInputStream = null;

    try
    {
        // try to connect and use the input stream
        wiki_connection.connect();
        wikiInputStream = wiki_connection.getInputStream();
    }
    catch ( IOException e )
    {
        // that failed, so fall back to the error stream
        wikiInputStream = wiki_connection.getErrorStream();
    }

    // parse the input stream using Jsoup
    Document docx = Jsoup.parse( wikiInputStream, null, wikidata_page.getProtocol() + "://" + wikidata_page.getHost() + "/" );

    // this selector can handle the less structured pages
    Elements linx = docx.select( "p:contains(" + associated_alias + ") ~ ul a:eq(0)" );
    for ( Element linq : linx )
    {
        System.out.println( linq.text() );
        String linq_nospace = URLEncoder.encode( linq.text(), "UTF-8" );
        Wikidata_Q_Reader.getQ( platonic_key, linq_nospace );
    }
}
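To convince myself that the visited-set idea from the top would actually break the cycle, here is a tiny self-contained toy that mimics the mutual recursion between the two classes; the names and the toy data are mine, nothing in it comes from the real project:

    import java.util.*;

    public class CycleDemo
    {
        // two toy "pages" that each list the other as a possible sense
        static Map<String, List<String>> senses = new HashMap<String, List<String>>();
        // the proposed fix: remember which pages were already visited
        static Set<String> visited = new HashSet<String>();

        static void getQ( String alias )
        {
            if ( !visited.add( alias ) )
            {
                // without this guard the two methods recurse into each
                // other until the JVM throws a StackOverflowError
                return;
            }
            System.out.println( "visiting " + alias );
            allPossibilities( alias );   // pretend every page is a disambig page
        }

        static void allPossibilities( String alias )
        {
            for ( String link : senses.get( alias ) )
            {
                getQ( link );            // each sense comes straight back here
            }
        }

        public static void main( String[] args )
        {
            senses.put( "A", Arrays.asList( "B" ) );
            senses.put( "B", Arrays.asList( "A" ) );
            getQ( "A" );                 // prints A and B once each, then stops
        }
    }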