我必须将记录传递给调用API的UDF,但是我们想要并行执行它,我们正在使用spark,这就是为什么要开发UDF,这里的问题是UDF只需要记录100条记录一次不超过这个,它不能平行处理100多条记录,所以如何确保一次只有100条记录传递到它请注意我们不想在整个记录上使用count()函数。 / p>
我在这里附加了UDF代码,它是一个泛型UDF,它返回struct.moreover数组,如果我们每次都在batchsize变量中传递100条记录,如果假设有198条记录,那么如果我们不想使用count(我们不会知道它的最后一批产品将是98.所以如何处理那件事。
伙计们...我有一个通用的UDF,其中调用API但在调用它之前首先创建100的批次然后只调用restapi ..所以UDF采用的参数是x1:string,x2:string,batchsize :integer(当前批量大小为100)..所以在UDF中,除非batchsize不是100,否则调用不会发生..并且对于每个记录,它将返回null。 因此,直到第99个记录它将返回。无效,但在第100个记录时,呼叫将发生 [所以,现在问题部分:因为我们正在进行批量调整100并且调用仅在第100个记录进行。因此,在条件下,如果我们假设文件中有198条记录,那么100条记录将获得输出但是,其他98只会返回null,因为它们不会被处理。 所以请大家帮帮忙,UDF一次拿一条记录,但它一直收集到第100条记录。我希望这个清除
public class Standardize_Address extends GenericUDF {
private static final Logger logger = LoggerFactory.getLogger(Standardize_Address.class);
private int counter = 0;
Client client = null;
private Batch batch = new Batch();
public Standardize_Address() {
client = new ClientBuilder().withUrl("https://ss-staging-public.beringmedia.com/street-address").build();
}
// StringObjectInspector streeti;
PrimitiveObjectInspector streeti;
PrimitiveObjectInspector cityi;
PrimitiveObjectInspector zipi;
PrimitiveObjectInspector statei;
PrimitiveObjectInspector batchsizei;
private ArrayList ret;
@Override
public String getDisplayString(String[] argument) {
return "My display string";
}
@Override
public ObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException {
System.out.println("under initialize");
if (args[0] == null) {
throw new UDFArgumentTypeException(0, "NO Street is mentioned");
}
if (args[1] == null) {
throw new UDFArgumentTypeException(0, "No Zip is mentioned");
}
if (args[2] == null) {
throw new UDFArgumentTypeException(0, "No city is mentioned");
}
if (args[3] == null) {
throw new UDFArgumentTypeException(0, "No State is mentioned");
}
if (args[4] == null) {
throw new UDFArgumentTypeException(0, "No batch size is mentioned");
}
/// streeti =args[0];
streeti = (PrimitiveObjectInspector)args[0];
// this.streetvalue = (StringObjectInspector) streeti;
cityi = (PrimitiveObjectInspector)args[1];
zipi = (PrimitiveObjectInspector)args[2];
statei = (PrimitiveObjectInspector)args[3];
batchsizei = (PrimitiveObjectInspector)args[4];
ret = new ArrayList();
ArrayList structFieldNames = new ArrayList();
ArrayList structFieldObjectInspectors = new ArrayList();
structFieldNames.add("Street");
structFieldNames.add("city");
structFieldNames.add("zip");
structFieldNames.add("state");
structFieldObjectInspectors.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
structFieldObjectInspectors.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
structFieldObjectInspectors.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
structFieldObjectInspectors.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
StructObjectInspector si2 = ObjectInspectorFactory.getStandardStructObjectInspector(structFieldNames,
structFieldObjectInspectors);
ListObjectInspector li2;
li2 = ObjectInspectorFactory.getStandardListObjectInspector(si2);
return li2;
}
@Override
public Object evaluate(DeferredObject[] args) throws HiveException {
ret.clear();
System.out.println("under evaluate");
// String street1 = streetvalue.getPrimitiveJavaObject(args[0].get());
Object oin = args[4].get();
System.out.println("under typecasting");
int batchsize = (Integer) batchsizei.getPrimitiveJavaObject(oin);
System.out.println("batchsize");
Object oin1 = args[0].get();
String street1 = (String) streeti.getPrimitiveJavaObject(oin1);
Object oin2 = args[1].get();
String zip1 = (String) zipi.getPrimitiveJavaObject(oin2);
Object oin3 = args[2].get();
String city1 = (String) cityi.getPrimitiveJavaObject(oin3);
Object oin4 = args[3].get();
String state1 = (String) statei.getPrimitiveJavaObject(oin4);
logger.info("address passed, street=" + street1 + ",zip=" + zip1 + ",city=" + city1 + ",state=" + state1);
counter++;
try {
System.out.println("under try");
Lookup lookup = new Lookup();
lookup.setStreet(street1);
lookup.setCity(city1);
lookup.setState(state1);
lookup.setZipCode(zip1);
lookup.setMaxCandidates(1);
batch.add(lookup);
} catch (BatchFullException ex) {
logger.error(ex.getMessage(), ex);
} catch (Exception e) {
logger.error(e.getMessage(), e);
}
/* batch.add(lookup); */
if (counter == batchsize) {
System.out.println("under if");
try {
logger.info("batch input street " + batch.get(0).getStreet());
try {
client.send(batch);
} catch (Exception e) {
logger.error(e.getMessage(), e);
logger.warn("skipping current batch, continuing with the next batch");
batch.clear();
counter = 0;
return null;
}
Vector<Lookup> lookups = batch.getAllLookups();
for (int i = 0; i < batch.size(); i++) {
// ListObjectInspector candidates;
ArrayList<Candidate> candidates = lookups.get(i).getResult();
if (candidates.isEmpty()) {
logger.warn("Address " + i + " is invalid.\n");
continue;
}
logger.info("Address " + i + " is valid. (There is at least one candidate)");
for (Candidate candidate : candidates) {
final Components components = candidate.getComponents();
final Metadata metadata = candidate.getMetadata();
logger.info("\nCandidate " + candidate.getCandidateIndex() + ":");
logger.info("Delivery line 1: " + candidate.getDeliveryLine1());
logger.info("Last line: " + candidate.getLastLine());
logger.info("ZIP Code: " + components.getZipCode() + "-" + components.getPlus4Code());
logger.info("County: " + metadata.getCountyName());
logger.info("Latitude: " + metadata.getLatitude());
logger.info("Longitude: " + metadata.getLongitude());
}
Object[] e;
e = new Object[4];
e[0] = new Text(candidates.get(i).getComponents().getStreetName());
e[1] = new Text(candidates.get(i).getComponents().getCityName());
e[2] = new Text(candidates.get(i).getComponents().getZipCode());
e[3] = new Text(candidates.get(i).getComponents().getState());
ret.add(e);
}
counter = 0;
batch.clear();
} catch (Exception e) {
logger.error(e.getMessage(), e);
}
return ret;
} else {
return null;
}
}
}