import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.RejectedExecutionHandler;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.config.SocketConfig;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Command-line entry point: generates register numbers, builds one URL per
 * register number and submits a {@link MyRunnable} for each URL to a bounded
 * thread pool.
 */
public class ThreadScrapResults {
    /** Number of worker threads that run fetch tasks concurrently. */
    private static final int MYTHREADS = 50;
    /** Maximum number of tasks waiting in the queue before the submitter is throttled. */
    private static final int QUEUE_CAPACITY = 105;

    // Not referenced inside this class; kept because the field is package-visible.
    HttpClient client = HttpClientBuilder.create().build();
    static Hashtable<String, String> subCodeSubName = null;
    static Hashtable<String, String> collCodeCollName = null;

    /**
     * Submits one task per generated register number (years 11..13, serial
     * numbers 001..350, i.e. 1050 URLs) and blocks until all of them have run.
     *
     * @param args ignored
     * @throws InterruptedException if the calling thread is interrupted while
     *         waiting for the submitted tasks to complete
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        BlockingQueue<Runnable> blockingQueue = new LinkedBlockingQueue<Runnable>(QUEUE_CAPACITY);
        // A pool whose core size is Integer.MAX_VALUE starts a new thread for
        // every submitted task and never uses the queue. Here the pool is
        // capped at MYTHREADS; when the queue is full, CallerRunsPolicy makes
        // the submitting thread run the task itself, which throttles submission
        // instead of rejecting work.
        ThreadPoolExecutor executor = new ThreadPoolExecutor(
                MYTHREADS, MYTHREADS, 0L, TimeUnit.MILLISECONDS, blockingQueue,
                new ThreadPoolExecutor.CallerRunsPolicy());
        // Socket options (keep-alive, timeouts) have to be configured on the
        // HttpClient that MyRunnable builds for itself; a connection manager
        // created here would not be used by those clients.

        // Generating some register numbers
        for (int year = 11; year <= 13; year++) {
            for (int i = 1; i <= 350; i++) {
                StringBuilder regNo = new StringBuilder("1111")
                        .append(year)
                        .append("111")
                        .append(String.format("%03d", i));
                String url = "magicUrl" + regNo;
                System.out.println(url);
                executor.execute(new MyRunnable(url, regNo.toString()));
            }
        }
        // No further submissions; already queued tasks still run to completion.
        executor.shutdown();
        // Block until every task has finished so that main returns only when
        // the whole batch is done.
        executor.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
    }
}
class MyRunnable implements Runnable{
private final String url;
private final String registerNumber;
public MyRunnable(String url, String registerNumber) {
// TODO Auto-generated constructor stub
this.url = url;
this.registerNumber = registerNumber;
}
public void run(){
HttpClient client = HttpClientBuilder.create().build();
HttpGet get = new HttpGet(url);
boolean insertOrNot = true;
HttpResponse response = null;
try {
response = client.execute(get);
} catch (ClientProtocolException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
BufferedReader rd = null;
try {
rd = new BufferedReader(
new InputStreamReader(response.getEntity().getContent()));
} catch (IllegalStateException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
//I get the result of each url here.
StringBuffer result = new StringBuffer();
String line = "";
try {
while ((line = rd.readLine()) != null) {
result.append(line);
}
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
Document resultWebPage = Jsoup.parse(result.toString());
Elements resultForm = resultWebPage.getElementsByTag("strong");
Elements error = resultWebPage.getElementsByTag("b");
if(error.size() == 4){
String inValidRegNo = error.get(3).html();
if(inValidRegNo.startsWith("Sorry")){
//log here
insertOrNot = false;
}
}
System.out.println(resultForm);
Iterator<Element> itr = resultForm.iterator();
int count = 1;
boolean set = true;
List<List<String>> resultDBOject = new ArrayList<List<String>>();
String regNum = null;
String name = null;
String deptName = null;
String collName = null; //TODO : Get collName and deptName from enum.
String key = "Super";
while(itr.hasNext()){
// System.out.println(itr.next().html());
key = itr.next().html();
try {
if(key.equals("<font color=\"#0000cc\" size=\"3\">Subject Code</font>") || key.equals("<font color=\"#0000cc\" size=\"3\">Grade</font>")
|| key.equals("<font color=\"#0000cc\" size=\"3\">Result</font>")){
continue;
}
else if(key.isEmpty()){
// System.out.println("N/A");
}else if(!key.isEmpty()){
if(set){
if(count == 1){
regNum = key;
// System.out.println(regNum);
count++;
}
if(count == 2){
name = itr.next().html();
// System.out.println(name);
count++;
}
if(count == 3){
deptName = itr.next().html();
// System.out.println(deptName);
}
}
if(count == 4 || count == 1){
count = 0;
set = false;
// String temp = itr.next().html();
// Result results = new Result();
// System.out.println(temp);
List<String> resultOfAStudent = new ArrayList<String>();
resultOfAStudent.add( key);
resultOfAStudent.add( itr.next().html());
resultOfAStudent.add(itr.next().html());
// resultOfAStudent.add(results.getSubjName());
resultDBOject.add(resultOfAStudent);
}
}
count++;
// System.out.println(count);
} finally{
}
} //end of while
//insert it in db
if(insertOrNot){
System.out.println("Successfully inserted" + registerNumber);
}
}
}
以下是我要做的事。
我生成1050个网址——main方法中的两个for循环工作正常。 1)运行后我得到了所有结果,但程序没有终止。 2)如何让该程序在处理完500个Url后休眠10秒,然后再继续处理接下来的500个Url?
答案 0 :(得分:2)
看看你的循环:
for(int year = 11; year <= 13; year++){
for(int i = 1; i <= 350; i++){
//generating 1050 URLs at one shot
StringBuffer regNo = new StringBuffer("1111").append(year).append("111").append(String.format("%03d", i));
String url = "magicUrl" + regNo;
System.out.println(url);
worker = new MyRunnable(url, regNo.toString());
}
}
您每次循环都会覆盖 worker,所以当执行到 executor.execute(worker); 时,worker 中只保留了最后一次赋给它的值,也就是用最后生成的那个网址创建的 Runnable。
请尝试将 worker = new MyRunnable(url, regNo.toString()); 这一行替换为 executor.execute(new MyRunnable(url, regNo.toString()));,看看是否能解决问题。