I have a DataFrame with two columns:
df =

Col1  Col2
aaa   bbb
ccc   aaa

I want to encode the String values into numeric values. I managed to do it this way:

import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}
val indexer1 = new StringIndexer()
.setInputCol("Col1")
.setOutputCol("Col1Index")
.fit(df)
val indexer2 = new StringIndexer()
.setInputCol("Col2")
.setOutputCol("Col2Index")
.fit(df)
val indexed1 = indexer1.transform(df)
val indexed2 = indexer2.transform(df)
val encoder1 = new OneHotEncoder()
.setInputCol("Col1Index")
.setOutputCol("Col1Vec")
val encoder2 = new OneHotEncoder()
.setInputCol("Col2Index")
.setOutputCol("Col2Vec")
val encoded1 = encoder1.transform(indexed1)
encoded1.show()
val encoded2 = encoder2.transform(indexed2)
encoded2.show()
The problem is that "aaa" is encoded differently in the two columns. How can I encode my DataFrame so that the new DataFrame is encoded consistently, e.g. so that "aaa" gets the same index in both columns?
Answer 0 (score: 2)
Train a single Indexer on both columns:
val df = Seq(("aaa", "bbb"), ("ccc", "aaa")).toDF("col1", "col2")
val indexer = new StringIndexer().setInputCol("col").fit(
df.select("col1").toDF("col").union(df.select("col2").toDF("col"))
)
and apply a copy of it to each column:
import org.apache.spark.ml.param.ParamMap
val result = Seq("col1", "col2").foldLeft(df){
(df, col) => indexer
.copy(new ParamMap()
.put(indexer.inputCol, col)
.put(indexer.outputCol, s"${col}_idx"))
.transform(df)
}
result.show
// +----+----+--------+--------+
// |col1|col2|col1_idx|col2_idx|
// +----+----+--------+--------+
// | aaa| bbb| 0.0| 1.0|
// | ccc| aaa| 2.0| 0.0|
// +----+----+--------+--------+
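If you also want the one-hot vectors from the original question, the same copy-with-ParamMap trick can be applied to a single OneHotEncoder on top of the shared indices. A minimal sketch, assuming the pre-Spark-3.0 OneHotEncoder transformer API used in the question (the `_vec` output names are just illustrative):

import org.apache.spark.ml.feature.OneHotEncoder

// One encoder definition, copied per column so both columns share the same settings.
val encoder = new OneHotEncoder()
val withVecs = Seq("col1", "col2").foldLeft(result) {
  (df, col) => encoder
    .copy(new ParamMap()
      .put(encoder.inputCol, s"${col}_idx")
      .put(encoder.outputCol, s"${col}_vec"))
    .transform(df)
}
withVecs.show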
Answer 1 (score: 1)
You can write your own Transformer, as in my PySpark code below. Example input data:
id,name,sex,ethnicity,hometown,organization,id_card_num,address,mobile_num,phone_num,education
10,Minnie,no,furry,Orlando,"Disney World",200,111 Looney Tunes Way,555-1213,555-1214,CartoonU
20,Mickey,yes,furry,Orlando,"Disney World",201,111 Looney Tunes Way,555-1212,555-1211,CartoonU
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import col
from pyspark.ml import Transformer
from pyspark.sql import DataFrame

# Fit one indexer on a generic 'StringCol' column; province_df is the
# DataFrame holding the string values to index (e.g. the sample data above).
sindex_pro = StringIndexer(inputCol='StringCol', outputCol='StringCol_c',
                           stringOrderType="frequencyDesc",
                           handleInvalid="keep").fit(province_df)
class SelfSI(Transformer):
    """Applies a fitted StringIndexer (trained on 'StringCol') to an
    arbitrary column by renaming it back and forth."""

    def __init__(self, clf, col_name):
        super(SelfSI, self).__init__()
        self.clf = clf            # the fitted StringIndexerModel
        self.col_name = col_name  # the column to index

    def rename_col(self, df, invers=False):
        or_name = 'StringCol'
        col_name = self.col_name
        if invers:
            # Restore the original column name, then rename the
            # indexer's output column to '<col_name>_c'.
            df = df.withColumnRenamed(or_name, col_name)
            or_name = col_name + '_c'
            col_name = 'StringCol_c'
        df = df.withColumnRenamed(col_name, or_name)
        return df

    def _transform(self, df: DataFrame) -> DataFrame:
        df = self.rename_col(df)
        df = self.clf.transform(df)
        df = self.rename_col(df, invers=True)
        return df
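A minimal usage sketch, continuing the code above (the two-column df and the col1/col2 names are assumed from the question; the indexer is fitted on the union of both columns so values share one index):

# Hypothetical usage example -- df is the two-column DataFrame from the question.
union_df = (df.select(col('col1').alias('StringCol'))
              .union(df.select(col('col2').alias('StringCol'))))
indexer = StringIndexer(inputCol='StringCol', outputCol='StringCol_c',
                        handleInvalid='keep').fit(union_df)
result = df
for c in ['col1', 'col2']:
    result = SelfSI(indexer, c).transform(result)
result.show()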