每次向UDF传入100条记录

时间:2018-03-08 03:04:16

标签: java apache-spark bigdata spark-dataframe

我必须将记录传递给一个调用API的UDF,并且希望并行执行,所以我们使用Spark并为此开发了UDF。问题在于该UDF一次只能接收100条记录,不能超过这个数量,也无法并行处理100条以上的记录。那么如何确保每次只向它传递100条记录?请注意,我们不想对全部记录使用count()函数。

我在这里附上了UDF代码。它是一个GenericUDF,返回struct数组。此外,如果我们每次都通过batchsize变量传入100,那么假设共有198条记录,在不使用count()的情况下,我们无法知道最后一批只有98条。这种情况该如何处理?

各位,我有一个GenericUDF,它会调用API,但在调用之前会先凑成100条一批,然后才调用REST API。该UDF接收的参数是x1: string、x2: string、batchsize: integer(当前批量大小为100)。在UDF内部,除非已累积的记录数达到batchsize(100),否则不会发起调用,并且对每条记录都返回null。因此,直到第99条记录它都返回null,到第100条记录时才会发起调用。现在说问题所在:由于我们按100条分批,并且只在第100条记录时才调用,所以假设文件中有198条记录,那么前100条会得到输出,而其余98条只会返回null,因为它们不会被处理。所以请大家帮帮忙:UDF一次接收一条记录,但会一直累积到第100条。希望这样说清楚了。

/**
 * Hive generic UDF that collects address rows into a batch and sends the batch
 * to a street-address REST endpoint once enough rows have been seen.
 *
 * <p>Arguments, in order: street, zip, city, state, batch size. For every row
 * that does not complete a batch, {@link #evaluate} returns {@code null}. For
 * the row that completes a batch it returns a list of
 * {@code struct<Street,city,zip,state>}, one entry per address in the batch
 * for which the service returned at least one candidate.
 *
 * <p>Known limitation: rows still sitting in a partial batch when the input
 * ends are never sent, because nothing in this class flushes them.
 */
public class Standardize_Address extends GenericUDF {

	private static final Logger logger = LoggerFactory.getLogger(Standardize_Address.class);

	/** Number of arguments the UDF reads: street, zip, city, state, batch size. */
	private static final int ARG_COUNT = 5;

	/** Rows seen since the last flush. */
	private int counter = 0;
	Client client = null;
	private Batch batch = new Batch();

	public Standardize_Address() {

		client = new ClientBuilder().withUrl("https://ss-staging-public.beringmedia.com/street-address").build();
	}

	PrimitiveObjectInspector streeti;
	PrimitiveObjectInspector cityi;
	PrimitiveObjectInspector zipi;
	PrimitiveObjectInspector statei;
	PrimitiveObjectInspector batchsizei;

	/** Reused result list; cleared at the start of every {@link #evaluate} call. */
	private ArrayList<Object[]> ret;

	@Override
	public String getDisplayString(String[] argument) {
		return "My display string";
	}

	/**
	 * Validates the argument inspectors and declares the return type.
	 *
	 * @param args inspectors for street, zip, city, state and batch size
	 * @return a list inspector over {@code struct<Street,city,zip,state>} of writable strings
	 * @throws UDFArgumentException if fewer than five arguments are supplied, or
	 *         an argument inspector is missing or not primitive
	 */
	@Override
	public ObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException {

		logger.debug("under initialize");

		if (args == null || args.length < ARG_COUNT) {
			throw new UDFArgumentException(
					"Expected " + ARG_COUNT + " arguments: street, zip, city, state, batchsize");
		}

		// The inspector for each position must match the position evaluate() reads
		// it from: args[1] is the zip and args[2] is the city.
		streeti = requirePrimitive(args, 0, "NO Street is mentioned");
		zipi = requirePrimitive(args, 1, "No Zip is mentioned");
		cityi = requirePrimitive(args, 2, "No city is mentioned");
		statei = requirePrimitive(args, 3, "No State is mentioned");
		batchsizei = requirePrimitive(args, 4, "No batch size is mentioned");

		ret = new ArrayList<Object[]>();

		ArrayList<String> structFieldNames = new ArrayList<String>();
		ArrayList<ObjectInspector> structFieldObjectInspectors = new ArrayList<ObjectInspector>();

		structFieldNames.add("Street");
		structFieldNames.add("city");
		structFieldNames.add("zip");
		structFieldNames.add("state");

		for (int i = 0; i < structFieldNames.size(); i++) {
			structFieldObjectInspectors.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
		}

		StructObjectInspector si2 = ObjectInspectorFactory.getStandardStructObjectInspector(structFieldNames,
				structFieldObjectInspectors);

		return ObjectInspectorFactory.getStandardListObjectInspector(si2);
	}

	/**
	 * Adds the current row to the pending batch and, once the batch holds
	 * {@code batchsize} rows, sends it and returns the standardized addresses.
	 *
	 * @param args deferred values for street, zip, city, state and batch size
	 * @return {@code null} while the batch is still filling or when the send
	 *         fails; otherwise the list of result structs for the batch
	 * @throws HiveException if an argument cannot be read or the batch size is null
	 */
	@Override
	public Object evaluate(DeferredObject[] args) throws HiveException {

		ret.clear();
		logger.debug("under evaluate");

		Object rawBatchSize = batchsizei.getPrimitiveJavaObject(args[4].get());
		if (rawBatchSize == null) {
			throw new HiveException("batch size must not be null");
		}
		// Going through Number accepts any integral column type, not only Integer.
		int batchsize = ((Number) rawBatchSize).intValue();

		String street1 = (String) streeti.getPrimitiveJavaObject(args[0].get());
		String zip1 = (String) zipi.getPrimitiveJavaObject(args[1].get());
		String city1 = (String) cityi.getPrimitiveJavaObject(args[2].get());
		String state1 = (String) statei.getPrimitiveJavaObject(args[3].get());

		logger.info("address passed, street=" + street1 + ",zip=" + zip1 + ",city=" + city1 + ",state=" + state1);
		counter++;

		enqueue(street1, zip1, city1, state1);

		// ">=" rather than "==" so a counter that overshoots still triggers a flush.
		if (counter < batchsize) {
			return null;
		}
		return flushBatch();
	}

	/** Returns the inspector at {@code index} as a primitive inspector, or fails naming that argument. */
	private static PrimitiveObjectInspector requirePrimitive(ObjectInspector[] args, int index, String missingMessage)
			throws UDFArgumentTypeException {
		ObjectInspector inspector = args[index];
		if (inspector == null) {
			throw new UDFArgumentTypeException(index, missingMessage);
		}
		if (!(inspector instanceof PrimitiveObjectInspector)) {
			throw new UDFArgumentTypeException(index, "Argument " + index + " must be a primitive type");
		}
		return (PrimitiveObjectInspector) inspector;
	}

	/** Builds a single-candidate lookup for one address and appends it to the pending batch. */
	private void enqueue(String street, String zip, String city, String state) {
		try {
			Lookup lookup = new Lookup();
			lookup.setStreet(street);
			lookup.setCity(city);
			lookup.setState(state);
			lookup.setZipCode(zip);
			lookup.setMaxCandidates(1);
			batch.add(lookup);
		} catch (BatchFullException ex) {
			// The row is dropped from this batch; it still counts towards the flush threshold.
			logger.error(ex.getMessage(), ex);
		} catch (Exception e) {
			logger.error(e.getMessage(), e);
		}
	}

	/**
	 * Sends the pending batch and converts the first candidate of every
	 * resolved lookup into a result struct. The batch and counter are always
	 * reset afterwards so a failure cannot block later batches.
	 */
	private Object flushBatch() {
		try {
			logger.info("batch input street " + batch.get(0).getStreet());
			try {
				client.send(batch);
			} catch (Exception e) {
				logger.error(e.getMessage(), e);
				logger.warn("skipping current batch, continuing with the next batch");
				return null;
			}

			Vector<Lookup> lookups = batch.getAllLookups();

			for (int i = 0; i < batch.size(); i++) {
				ArrayList<Candidate> candidates = lookups.get(i).getResult();

				if (candidates == null || candidates.isEmpty()) {
					logger.warn("Address " + i + " is invalid.\n");
					continue;
				}

				logger.info("Address " + i + " is valid. (There is at least one candidate)");

				for (Candidate candidate : candidates) {
					logCandidate(candidate);
				}

				// Each lookup requests at most one candidate, so the result for
				// address i is always its first candidate, never candidate i.
				ret.add(toRow(candidates.get(0)));
			}
		} catch (Exception e) {
			logger.error(e.getMessage(), e);
		} finally {
			batch.clear();
			counter = 0;
		}
		return ret;
	}

	/** Logs the delivery lines and metadata of one candidate. */
	private static void logCandidate(Candidate candidate) {
		final Components components = candidate.getComponents();
		final Metadata metadata = candidate.getMetadata();

		logger.info("\nCandidate " + candidate.getCandidateIndex() + ":");
		logger.info("Delivery line 1: " + candidate.getDeliveryLine1());
		logger.info("Last line:       " + candidate.getLastLine());
		logger.info("ZIP Code:        " + components.getZipCode() + "-" + components.getPlus4Code());
		logger.info("County:          " + metadata.getCountyName());
		logger.info("Latitude:        " + metadata.getLatitude());
		logger.info("Longitude:       " + metadata.getLongitude());
	}

	/** Converts a candidate into the struct layout declared in initialize: Street, city, zip, state. */
	private static Object[] toRow(Candidate candidate) {
		final Components components = candidate.getComponents();
		Object[] row = new Object[4];
		row[0] = text(components.getStreetName());
		row[1] = text(components.getCityName());
		row[2] = text(components.getZipCode());
		row[3] = text(components.getState());
		return row;
	}

	/** Wraps a string as a writable, keeping a missing component as a null struct field. */
	private static Text text(String value) {
		return value == null ? null : new Text(value);
	}

}

0 个答案:

没有答案