I have a PySpark DataFrame with this structure:
Something like:
root
|-- Id: string (nullable = true)
|-- Q: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- pr: string (nullable = true)
| | |-- qt: double (nullable = true)
I would like to turn the Q array into columns (the pr field giving the column name and qt the value). In addition, I want to avoid duplicate columns by merging (adding) columns with the same name.
+----+----------------------------+
| Id | Q                          |
+----+----------------------------+
| 001| [ [pr1,1.9], [pr3,2.0]...] |
| 002| [ [pr2,1.0], [pr9,3.9]...] |
| 003| [ [pr2,9.0], ... ]         |
+----+----------------------------+
...
How can I perform this transformation? Thanks in advance!! Julián.
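For reference, a minimal sketch of building a DataFrame with this shape (the values are taken from the sample rows above; a running SparkSession named spark is assumed):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# sample data matching the schema above: an Id plus an array of (pr, qt) structs
df = spark.createDataFrame(
    [
        ("001", [("pr1", 1.9), ("pr3", 2.0)]),
        ("002", [("pr2", 1.0), ("pr9", 3.9)]),
        ("003", [("pr2", 9.0)]),
    ],
    "Id string, Q array<struct<pr:string, qt:double>>",
)
df.printSchema()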
Answer 0 (score: 7)
You can use a combination of explode and pivot:
import pyspark.sql.functions as F

# explode to get "long" format: one row per (Id, struct) pair
df = df.withColumn('exploded', F.explode('Q'))

# pull the name (pr) and the value (qt) out of the struct into separate columns
df = df.withColumn('name', F.col('exploded').getItem('pr'))
df = df.withColumn('value', F.col('exploded').getItem('qt'))

# now pivot: one column per name, missing combinations filled with 0
df.groupby('Id').pivot('name').agg(F.max('value')).na.fill(0)
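Since the question asks for columns with the same pr to be merged by adding their values, F.sum can be swapped in for F.max as the pivot aggregation; a minimal variant of the last line above:

# add up duplicate (Id, name) pairs instead of keeping the maximum
result = df.groupby('Id').pivot('name').agg(F.sum('value')).na.fill(0)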
Answer 1 (score: 0)
Very interesting question. Here is how I approached it.
test.csv
001,pr1:0.9,pr3:1.2,pr2:2.0
002,pr3:5.2,pr4:0.99
Pyspark
from pyspark.sql.functions import expr

file = sc.textFile("file:///test.csv")
# get it into (key, value) pairs, one pair per "pr:qt" entry
# [(u'001', u'pr1:0.9'), (u'001', u'pr3:1.2'), ...]
# equivalent alternative:
# rdd1 = file.map(lambda r: r.replace(",","\t",1)).map(lambda r: r.split("\t")).map(lambda r: (r[0],r[1])).flatMapValues(lambda r: r.split(','))
rdd1 = file.map(lambda r: r.split(",", 1)).map(lambda r: (r[0], r[1])).flatMapValues(lambda r: r.split(','))
# split each value on ":" to get an RDD of (Id, pr, qt) triples
# [(u'001', u'pr1', u'0.9'), ...]
rdd2 = rdd1.map(lambda r: (r[0], r[1].split(":"))).map(lambda r: (r[0], r[1][0], r[1][1]))

# create a DF with 3 columns
df = rdd2.toDF()

+---+---+----+
| _1| _2|  _3|
+---+---+----+
|001|pr1| 0.9|
|001|pr3| 1.2|
|001|pr2| 2.0|
|002|pr3| 5.2|
|002|pr4|0.99|
+---+---+----+

# perform the magic: one column per pr name, missing entries defaulting to 0
df.groupBy("_1").pivot("_2").agg(expr("coalesce(first(_3),0)"))
+---+---+---+---+----+
| _1|pr1|pr2|pr3| pr4|
+---+---+---+---+----+
|001|0.9|2.0|1.2| 0|
|002| 0| 0|5.2|0.99|
+---+---+---+---+----+
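Note that the values parsed from the text file are strings. If the duplicates should actually be added up numerically, as the question asks, one option (a sketch on top of the code above, not part of the original answer) is to cast the value column to double and aggregate with sum:

from pyspark.sql import functions as F

# cast the parsed value column to double so duplicate values can be summed
df = df.withColumn("_3", F.col("_3").cast("double"))
df.groupBy("_1").pivot("_2").agg(F.sum("_3")).na.fill(0)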