我有一个文件,其中包含如下数据:
11/16/2015,"others (phone,health,etc.)",cont'd attempts,"resource,inc.",dg
我想删除双引号内的逗号。
预期结果
11/16/2015,"others (phone health etc.)",cont'd attempts,"resource inc.",dg
到目前为止,我试过了
Foreach a generate replace ($1,',','');
Foreach a generate regex_extract($1,'[\,]+',1);
但它们都不起作用。
答案 0 :(得分:0)
首先使用正则表达式(REGEX_EXTRACT_ALL)把每一行拆分成元组中的各个字段,然后对原本带双引号的字段应用 REPLACE。
试试这段代码:
-- Load every input line whole as a single chararray. TextLoader does not split
-- on a delimiter, so a line that happens to contain a tab is not truncated.
a = load '<path>' using TextLoader() as (line:chararray);
-- Split the line into five fields: plain, quoted, plain, quoted, plain.
-- The pattern is written for exactly this layout; the double quotes sit
-- outside the capture groups, so they are not part of the extracted fields.
b = foreach a generate FLATTEN(REGEX_EXTRACT_ALL(line,'(.*)[,]["](.*)["][,](.*)[,]["](.*)["][,](.*)'));
-- Replace the commas inside the two formerly quoted fields ($1 and $3) with spaces.
c = foreach b generate $0,REPLACE($1,',',' '),$2,REPLACE($3,',',' '),$4;
dump c;
答案 1 :(得分:0)
可以使用UDF实现,UDF可以查看传递的每个元组中的所有字段。
import java.io.IOException;
import java.lang.Exception;
import java.lang.Long;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import org.apache.pig.EvalFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
public class CommaRemove extends EvalFunc<DataBag> {
@Override
public DataBag exec(Tuple input) throws IOException {
if (input == null || input.size() == 0) {
return null;
}
try {
int inputSize = input.size();
Tuple output = TupleFactory.getInstance().newTuple(inputSize);
for( int i = 0; i < inputSize ; i++)
{
output.set(i, input.get(i).replace(',',''));
}
return output;
} catch (Exception e) {
System.err.println("Failed to process input; error - " + e.getMessage());
return null;
}
}
}