To make the saveToCassandra() write to Cassandra faster, I call repartitionByCassandraReplica() right before it. What I observe is that the writes hit each Cassandra node separately, one node at a time: for about 25 minutes the writes go to node 1, for the next 25 minutes to node 2, and so on. This behavior seems strange to me, because I expected Spark to write to all nodes simultaneously. I am not sure whether the cause is my code or the cluster configuration.
I am using:
Spark job code (sorry, it is the Java API):
// Scan the source table and convert each CassandraRow to a ProfileAttribute.
CassandraTableScanJavaRDD<CassandraRow> rdd1 =
        javaFunctions(context).cassandraTable("keyspace", "profile_attribute");

JavaRDD<ProfileAttribute> rdd2 = rdd1.mapPartitions(
        new FlatMapFunction<Iterator<CassandraRow>, ProfileAttribute>() {
            @Override
            public Iterator<ProfileAttribute> call(Iterator<CassandraRow> iter) throws Exception {
                List<ProfileAttribute> l = new LinkedList<>();
                while (iter.hasNext()) {
                    CassandraRow row = iter.next();
                    ProfileAttribute att = rowToProfileAttribute(row);
                    l.add(att);
                }
                return l.iterator();
            }
        }, true);
// Map each ProfileAttribute to an EntityAlias.
JavaRDD<EntityAlias> rdd3 = rdd2.mapPartitions(
        new FlatMapFunction<Iterator<ProfileAttribute>, EntityAlias>() {
            @Override
            public Iterator<EntityAlias> call(Iterator<ProfileAttribute> iter) throws Exception {
                List<EntityAlias> l = new LinkedList<>();
                while (iter.hasNext()) {
                    ProfileAttribute attr = iter.next();
                    EntityAlias alias = new EntityAlias(attr.getTerm(), attr.getPid(),
                            attr.getAssociation_type_id(), attr.getCreate_date(),
                            attr.getPartner_id(), attr.getNumeric_value(), attr.getAttribute_type());
                    l.add(alias);
                }
                return l.iterator();
            }
        }, true);
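(Side note: I realize both mapPartitions steps buffer each whole partition into a LinkedList. Since they are one-to-one transformations, plain map calls would do the same work without the buffering; a sketch of what I mean, using the Spark 2.x Java lambda API:)

// One-to-one equivalents of the rdd2/rdd3 steps above, without
// materializing each partition in a LinkedList first.
JavaRDD<ProfileAttribute> attrs = rdd1.map(row -> rowToProfileAttribute(row));
JavaRDD<EntityAlias> aliases = attrs.map(attr -> new EntityAlias(
        attr.getTerm(), attr.getPid(), attr.getAssociation_type_id(),
        attr.getCreate_date(), attr.getPartner_id(),
        attr.getNumeric_value(), attr.getAttribute_type()));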
// Repartition into 2000 partitions so that each partition's rows are local
// to a replica that owns them, keyed by the alias_key partition key.
JavaRDD<EntityAlias> rdd4 =
        CassandraJavaUtil.javaFunctions(rdd3).repartitionByCassandraReplica(
                "keyspace", "entity_alias", 2000,
                CassandraJavaUtil.someColumns("alias_key"),
                CassandraJavaUtil.mapToRow(EntityAlias.class));
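(To rule out skew from the repartition itself, I can dump the per-partition row counts; a quick diagnostic sketch that collects only the counts, not the rows:)

// Diagnostic: how many partitions did repartitionByCassandraReplica
// produce, and how many rows landed in each one?
System.out.println("partitions: " + rdd4.getNumPartitions());
List<Integer> sizes = rdd4.mapPartitions(it -> {
    int n = 0;
    while (it.hasNext()) { it.next(); n++; }
    return java.util.Collections.singletonList(n).iterator();
}).collect();
System.out.println("rows per partition: " + sizes);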
// Key each EntityAlias by its full primary key, keeping the partitioning
// produced by the repartition step (preservesPartitioning = true).
JavaPairRDD<EntityAliasPK, EntityAlias> rdd5 = rdd4.mapPartitionsToPair(
        new PairFlatMapFunction<Iterator<EntityAlias>, EntityAliasPK, EntityAlias>() {
            @Override
            public Iterator<Tuple2<EntityAliasPK, EntityAlias>> call(Iterator<EntityAlias> iter) throws Exception {
                List<Tuple2<EntityAliasPK, EntityAlias>> l = new LinkedList<>();
                while (iter.hasNext()) {
                    EntityAlias alias = iter.next();
                    EntityAliasPK pk = new EntityAliasPK(
                            alias.getAlias_key(), alias.getEntity_id(), alias.getComposition_key());
                    l.add(new Tuple2<>(pk, alias));
                }
                return l.iterator();
            }
        }, true);
// Deduplicate: for rows sharing a primary key, keep the most recent one
// and count the merged duplicates in an accumulator.
JavaPairRDD<EntityAliasPK, EntityAlias> rdd6 = rdd5.reduceByKey(
        new Function2<EntityAlias, EntityAlias, EntityAlias>() {
            @Override
            public EntityAlias call(EntityAlias e1, EntityAlias e2) throws Exception {
                _duplicate.add(1L);
                return e1.getDate() >= e2.getDate() ? e1 : e2;
            }
        });
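(_duplicate is just a counter; roughly how it is declared, assuming a Spark 2.x LongAccumulator and that context is the JavaSparkContext. Since accumulators updated inside transformations can over-count when tasks are retried, I treat it as a rough indicator only:)

// Driver-side accumulator counting merged duplicate keys.
// Assumes `context` is the JavaSparkContext created for the job.
LongAccumulator _duplicate = context.sc().longAccumulator("duplicate_aliases");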
JavaRDD<EntityAlias> rdd7 = rdd6.values();
// Build the writer, then clone its default WriteConf, overriding only the
// TTL with a constant 5000 seconds.
RDDAndDStreamCommonJavaFunctions<EntityAlias>.WriterBuilder wb =
        javaFunctions(rdd7).writerBuilder("keyspace", "entity_alias", mapToRow(EntityAlias.class));
WriteConf wc = new WriteConf(
        wb.writeConf.batchSize(),
        wb.writeConf.batchGroupingBufferSize(),
        wb.writeConf.batchGroupingKey(),
        wb.writeConf.consistencyLevel(),
        wb.writeConf.ifNotExists(),
        wb.writeConf.ignoreNulls(),
        wb.writeConf.parallelismLevel(),
        wb.writeConf.throughputMiBPS(),
        TTLOption.constant(5000),
        TimestampOption.defaultValue(),
        false);
wb.withWriteConf(wc).saveToCassandra();
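(For what it's worth, I believe the same knobs can also be set through the connector's standard output properties plus WriteConf.fromSparkConf, instead of the positional constructor; a sketch with connector 2.x property names that I have not verified end to end:)

// Alternative: derive the WriteConf from SparkConf via the connector's
// documented output properties (assumed names, connector 2.x).
SparkConf sparkConf = new SparkConf()
        .set("spark.cassandra.output.concurrent.writes", "5")      // parallelism level
        .set("spark.cassandra.output.batch.grouping.key", "partition")
        .set("spark.cassandra.output.ttl", "5000");                 // constant TTL in seconds
WriteConf fromConf = WriteConf.fromSparkConf(sparkConf);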
The entity classes:
class ProfileAttribute {
    private String pid;                    // partition key
    private Integer partner_id;            // clustering column
    private Integer attribute_key;         // clustering column
    private Integer attribute_type;
    private Integer numeric_value;
    private Long create_date;
    private String term;
    private Integer site_id;
    private Integer ttl_seconds;
    private Integer uahash;
    private Integer association_type_id;
    private Long ualong_hash;
    private String xpartner_advertiser_id;
    private Integer ttl;
    private Long writetime;
}

class EntityAlias {
    private String alias_key;              // partition key
    private String entity_id;              // clustering column
    private long composition_key;          // clustering column
    private int association_type;
    private Long date;
    private int partner_id;
    private int numeric;
    private int attribute_type;
}

class EntityAliasPK {
    private String alias_key;
    private String entity_id;
    private long composition_key;
}
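(One thing elided above: since EntityAliasPK is the reduceByKey key, it must define value-based equals and hashCode, otherwise keys would be compared by object identity and no duplicates would ever merge. Mine look roughly like this, using java.util.Objects:)

// Inside EntityAliasPK: value equality over all three key fields.
@Override
public boolean equals(Object o) {
    if (this == o) return true;
    if (!(o instanceof EntityAliasPK)) return false;
    EntityAliasPK other = (EntityAliasPK) o;
    return composition_key == other.composition_key
            && Objects.equals(alias_key, other.alias_key)
            && Objects.equals(entity_id, other.entity_id);
}

@Override
public int hashCode() {
    return Objects.hash(alias_key, entity_id, composition_key);
}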