所以我试图将Retrosheet.org中的棒球事件文件导入Hadoop。每个游戏条目遵循以下格式,每个文件包含一个季节的游戏条目(例如,这是一个不完整的记录,为了冗余和节省空间而删除了一些项目):
id,BOS192704230
version,1
info,inputprogvers,"version 7RS(19) of 07/07/92"
info,visteam,WS1
start,judgj101,"Joe Judge",0,5,3
start,myerb103,"Buddy Myer",0,6,6
start,blueo102,"Ossie Bluege",0,7,5
play,4,0,myerb103,??,,S/BG
play,4,0,blueo102,??,,CS2(2E4)
play,4,0,blueo102,??,,7/FL
play,4,0,ruelm101,??,,63
play,4,0,crowg102,??,,NP
sub,wests101,"Sam West",0,9,11
play,4,0,wests101,??,,K/C
play,4,1,wannp101,??,,NP
sub,liseh101,"Hod Lisenbee",0,9,1
play,4,1,wannp101,??,,W
play,4,1,rothj101,??,,CS2(26)
play,4,1,rothj101,??,,7/F
play,4,1,tobij101,??,,5/P
play,5,0,rices101,??,,6/P
data,er,crowg102,4
data,er,liseh101,0
data,er,braxg101,1
data,er,marbf101,0
data,er,harrs101,3
我正在进行第一次将其导入Hadoop,并且无法实现正确的自定义InputFileFormat以成功读取此类记录。我一直试图使用正则表达式“id,[AZ] {3} [在每个游戏记录的第一行(由”id“表示,然后是团队,季节,日期和游戏代码)分割文件。 0-9] {9}”。当我输出这个(我正在使用SequenceFile输出,但SequenceFile和常规Text文件输出都返回相同的结果)时,我得到一个空的结果文件。朝着正确方向的任何一点都会非常有帮助。到目前为止,我所获得的代码基于此处的模板:http://dronamk.blogspot.com/2013/03/regex-custom-input-format-for-hadoop.html。我使用的是基本相同的代码,只是编译上面提到的正则表达式而不是包含的表达式。
有问题的课程:
package project.baseball;
import java.io.IOException;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
public class RegexInputFormat extends
InputFormat<LongWritable, TextArrayWritable> {
public Pattern pattern = Pattern.compile("id,[A-Z]{3}[0-9]{9}");
private TextInputFormat textIF = new TextInputFormat();
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException,
InterruptedException {
return textIF.getSplits(context);
}
@Override
public RecordReader<LongWritable, TextArrayWritable> createRecordReader(
InputSplit split, TaskAttemptContext context) throws IOException,
InterruptedException {
RegexRecordReader reader = new RegexRecordReader();
if (pattern == null) {
throw new IllegalStateException(
"No pattern specified - unable to create record reader");
}
reader.setPattern(pattern);
return reader;
}
public static class RegexRecordReader extends
RecordReader<LongWritable, TextArrayWritable> {
private LineRecordReader lineRecordReader = new LineRecordReader();
private Pattern pattern;
TextArrayWritable value = new TextArrayWritable();
public void setPattern(Pattern pattern2) {
pattern = pattern2;
}
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
lineRecordReader.initialize(split, context);
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
while (lineRecordReader.nextKeyValue()) {
Matcher matcher;
matcher = pattern.matcher(lineRecordReader.getCurrentValue()
.toString());
if (matcher.find()) {
int fieldCount;
Text[] fields;
fieldCount = matcher.groupCount();
fields = new Text[fieldCount];
for (int i = 0; i < fieldCount; i++) {
fields[i] = new Text(matcher.group(i + 1));
}
value.setFields(fields);
return true;
}
}
return false;
}
@Override
public LongWritable getCurrentKey() throws IOException,
InterruptedException {
return lineRecordReader.getCurrentKey();
}
@Override
public TextArrayWritable getCurrentValue() throws IOException,
InterruptedException {
return value;
}
@Override
public float getProgress() throws IOException, InterruptedException {
return lineRecordReader.getProgress();
}
@Override
public void close() throws IOException {
lineRecordReader.close();
}
}
}
答案 0 :(得分:1)
你的正则表达式可能会错过上下文,即分割线周围的内容。
请改为尝试:
(.*)(id,([A-Z]{3}[0-9]{9}))(.*)