I want to read three different files from three different folders in HDFS, so I wrote a Java program to read the files, but while reading them I run into a "too many open files" error.
I want to read a file from folder1, check it against the files in folder2, and store the matching data in Cassandra, so I wrote the program below.
My Java program:
public class hadoopjoin {
public long Dateform(String value) throws Exception
{
DateFormat df = new SimpleDateFormat("yyyy-MM-dd");
Date dte = df.parse(value);
return dte.getTime();
}
public static void main (String [] args) throws Exception{
Cluster cluster;
Session session;
cluster= Cluster.builder().addContactPoint("localhost").build();
Metadata metadata=cluster.getMetadata();
System.out.println("Cluster Connected"+metadata.getClusterName());
session=cluster.connect();
hadoopjoin s = new hadoopjoin();
Configuration conf=new Configuration();
conf.addResource(new Path("/home/bigdata/hadoop/conf/core-site.xml"));
conf.addResource(new Path("/home/bigdata/hadoop/conf/hdfs-site.xml"));
// these settings must be applied before FileSystem.get(), otherwise they have no effect on the returned instance
conf.set("fs.default.name", "hdfs://localhost:54310");
conf.set("mapred.job.tracker", "localhost:54311");
conf.setBoolean("fs.hdfs.impl.disable.cache", true);
FileSystem fs=FileSystem.get(conf);
FileSystem fs1=FileSystem.get(conf);
FileStatus[] status=fs.listStatus(new Path("hdfs://localhost:54310/user/bigdata/TAGDETAIL/HADOOPTAGDETAIL"));
FileStatus[] status1=fs1.listStatus(new Path("hdfs://localhost:54310/user/bigdata/COMMENTUSER/HADOOP"));
session.execute("create table if not exists hadooptagdetailsnew(quesid bigint,accpid bigint,anscount bigint,comcount bigint,viewcount bigint,quescreationdate timestamp,quesyear text,quesmonth text,tags text,quesuserid bigint,quesusername text,quesuserage bigint,queslocation text,quesreputation bigint,quesimage text,anscreationdate timestamp,ansyear text,ansmonth text,ansid bigint,ansuserid bigint,ansname text,ansage bigint,anslocation text,ansreputation bigint,ansimage text,commuserage text,resolution bigint,primary key(quesid,tags,quescreationdate,ansid));");
for(int i=2;i<status.length;i++)
{
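// NOTE: this reader is never closed; each outer iteration leaks an open HDFS stream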
BufferedReader br=new BufferedReader(new InputStreamReader(fs.open(status[i].getPath())));
String line;
line=br.readLine();
while(line!=null)
{
String[] vals=line.split("\t");
long anscreationdate;
String ansyear;
String ansmonth;
long ansid;
long ansuserid;
String ansusername;
String ansimageurl="";
long ansuserage;
String ansuserlocation;
long ansuserreputation;
String comuserage="";
long comuserpostid;
long resolution=0;
long quesid=Long.parseLong(vals[0]);
long accpid;
if(vals[1] == null || vals[1].isEmpty())
{
accpid= 0;
}
else
{
accpid= Long.parseLong(vals[1]);
}
long anscount ;
if(vals[2] == null || vals[2].isEmpty())
{
anscount= 0;
}
else
{
anscount= Long.parseLong(vals[2]);
}
long comcount;
if(vals[3] == null || vals[3].isEmpty())
{
comcount= 0;
}
else
{
comcount= Long.parseLong(vals[3]);
}
long viewcount;
if(vals[4] == null || vals[4].isEmpty())
{
viewcount= 0;
}
else
{
viewcount= Long.parseLong(vals[4]);
}
long quecreationdate=s.Dateform(vals[5].substring(0, 10));
// String quecreationdate = quescreationdate[0];
String queyear = vals[6];
String quemonth = vals[7];
String tags ="hadoop";
long queuserid;
try{
if(vals[9] == null || vals[9].isEmpty())
queuserid=0;
else
queuserid = Long.parseLong(vals[9]);
}
catch (IndexOutOfBoundsException e)
{
queuserid=0;
}
String queusername;
try{
if(vals[10] == null || vals[10].isEmpty())
{
queusername="";
}
else
{
queusername=vals[10];
}}
catch(IndexOutOfBoundsException e)
{
queusername="";
}
long queuserage ;
try{
if(vals[11] == null || vals[11].isEmpty())
{
queuserage= 0;
}
else
{
queuserage= Long.parseLong(vals[11]);
}
}
catch(IndexOutOfBoundsException e)
{
queuserage=0;
}
String queuserlocation;
try{
if(vals[12] == null || vals[12].isEmpty())
{
queuserlocation="";
}
else
{
queuserlocation=vals[12];
}
}
catch(IndexOutOfBoundsException e)
{
queuserlocation="";
}
long quesuserreputation;
try{
if(vals[13] == null || vals[13].isEmpty())
{
quesuserreputation= 0;
}
else
{
quesuserreputation= Long.parseLong(vals[13]);
}
}
catch(IndexOutOfBoundsException e)
{
quesuserreputation=0;
}
String quesimageurl="";
try{
if(vals[14] == null || vals[14].isEmpty())
{
quesimageurl=" ";
}
else
{
quesimageurl=vals[14];
}
}
catch(IndexOutOfBoundsException e)
{
quesimageurl=" ";
}
try{
if(vals[15] == null || vals[15].isEmpty())
{
anscreationdate= 0;
//System.out.println(anscreationdate);
}
else
{
anscreationdate= s.Dateform(vals[15].substring(0,10));
// System.out.println(anscreationdate);
}
}
catch(IndexOutOfBoundsException e)
{
anscreationdate=0;
}
if(anscreationdate!=0)
{
if(vals[16] == null || vals[16].isEmpty())
{
ansyear="";
}
else
{
ansyear=vals[16];
}
if(vals[17] == null || vals[17].isEmpty())
{
ansmonth="";
}
else
{
ansmonth=vals[17];
}
if(vals[18] == null || vals[18].isEmpty())
{
ansid= 0;
}
else
{
ansid= Long.parseLong(vals[18]);
}
for(int j=2;j<status1.length;j++)
{
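// NOTE: a new reader is opened here for every folder2 file on every line of the outer file, and none is ever closed -- the likely source of "Too many open files"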
BufferedReader br1=new BufferedReader(new InputStreamReader(fs1.open(status1[j].getPath())));
String line1;
line1=br1.readLine();
while(line1!=null)
{
String[] vals1=line1.split("\t");
//System.out.println(vals1[0]+":"+vals1[1]);
long comid=Long.parseLong(vals1[0]);
try{
if(comid==ansid||comid==quesid)
{
if(vals1[4]!=null)
comuserage=vals1[4];
comuserage+=",";
// comuserage=vals1[4];
}}
catch(IndexOutOfBoundsException e)
{
comuserage=""; // was null, which would throw NullPointerException at the isEmpty() check below
}
line1=br1.readLine();
}
}
if(!(comuserage.isEmpty()))
comuserage=comuserage.substring(0, (comuserage.length())-1);
// System.out.println(comuserage);
try{
if(vals[19] == null || vals[19].isEmpty())
{
ansuserid= 0;
}
else
{
ansuserid= Long.parseLong(vals[19]);
}
}
catch(IndexOutOfBoundsException e)
{
ansuserid=0;
}
try{
if(vals[20] == null || vals[20].isEmpty())
{
ansusername="";
}
else
{
ansusername=vals[20];
}
}
catch(IndexOutOfBoundsException e)
{
ansusername="";
}
try{
if(vals[21] == null || vals[21].isEmpty())
{
ansuserage= 0;
}
else
{
ansuserage= Long.parseLong(vals[21]);
}
}
catch(IndexOutOfBoundsException e)
{
ansuserage=0;
}
try{
if(vals[22] == null || vals[22].isEmpty())
{
ansuserlocation="";
}
else
{
ansuserlocation=vals[22];
}
}
catch(IndexOutOfBoundsException e)
{
ansuserlocation="";
}
try{
if(vals[23] == null || vals[23].isEmpty())
{
ansuserreputation= 0;
}
else
{
ansuserreputation= Long.parseLong(vals[23]);
}
}
catch(IndexOutOfBoundsException e)
{
ansuserreputation=0;
}
try{
if(vals[24] == null || vals[24].isEmpty())
{
ansimageurl=" ";
}
else
{
ansimageurl=vals[24];
}
}
catch(IndexOutOfBoundsException e)
{
ansimageurl="";
}
}
else
{
ansyear="";
ansmonth="";
ansid=0;
ansuserid=0;
ansusername="";
ansimageurl="";
ansuserage=0;
ansuserlocation="";
ansuserreputation=0;
}
String[] cols={"quesid","accpid","anscount","comcount","viewcount","quescreationdate","quesyear","quesmonth","tags","quesuserid","quesusername","quesuserage","queslocation","quesreputation","quesimage","anscreationdate","ansyear","ansmonth","ansid","ansuserid","ansname","ansage","anslocation","ansreputation","ansimage","commuserage","resolution"};
Object[] value={quesid,accpid,
anscount,
comcount,
viewcount,
quecreationdate,
queyear,
quemonth,
tags,
queuserid,
queusername,
queuserage,
queuserlocation ,
quesuserreputation,
quesimageurl,
anscreationdate,
ansyear,
ansmonth,
ansid,
ansuserid,
ansusername,
ansuserage,
ansuserlocation,
ansuserreputation,
ansimageurl,
comuserage,
resolution
};
Statement ins=QueryBuilder.insertInto("stackexchange","hadooptagdetailsnew").values(cols,value);
System.out.println(ins+","+comuserage);
session.execute(ins);
line =br.readLine();
}
}
cluster.shutdown();
}
}
Error:
14/05/15 09:14:11 WARN hdfs.DFSClient: Failed to connect to /127.0.0.1:50010, add to deadNodes and continue
java.net.SocketException: Too many open files
14/05/15 09:14:11 WARN hdfs.DFSClient: DFS Read: java.io.IOException: Could not obtain block: blk_-8822436491089176071_6549 file=/user/bigdata/COMMENTUSER/HADOOP/part-m-00000
at org.apache.hadoop.hdfs.DFSClient$DFSInputStream.chooseDataNode(DFSClient.java:2460)
at org.apache.hadoop.hdfs.DFSClient$DFSInputStream.blockSeekTo(DFSClient.java:2252)
at org.apache.hadoop.hdfs.DFSClient$DFSInputStream.read(DFSClient.java:2415)
at java.io.DataInputStream.read(DataInputStream.java:149)
Please help me solve this problem.
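A likely culprit: br1 is opened once per folder2 file for every single line of every folder1 file, and neither br nor br1 is ever closed, so the process eventually exhausts its file descriptors. Below is a minimal sketch of the same nested read with both readers closed via try-with-resources (Java 7+; fs, fs1, status and status1 as in the program above) -- a sketch assuming the descriptor leak is indeed the unclosed readers, not a definitive rewrite:

for (int i = 2; i < status.length; i++) {
    // try-with-resources guarantees br is closed even if an exception is thrown
    try (BufferedReader br = new BufferedReader(
            new InputStreamReader(fs.open(status[i].getPath())))) {
        String line;
        while ((line = br.readLine()) != null) {
            for (int j = 2; j < status1.length; j++) {
                // the inner reader is closed on every pass, so descriptors no longer accumulate
                try (BufferedReader br1 = new BufferedReader(
                        new InputStreamReader(fs1.open(status1[j].getPath())))) {
                    String line1;
                    while ((line1 = br1.readLine()) != null) {
                        // ... match comid against ansid/quesid and insert into Cassandra as before ...
                    }
                }
            }
        }
    }
}

On Java 6, closing br1 (and br) in a finally block achieves the same effect. Independent of the leak, reopening every folder2 file for each line of folder1 is also very expensive; reading the smaller side once into a HashMap keyed on its first column before the outer loop would avoid reopening files altogether.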