PySpark:如何为包含数组(array)和结构体(struct)的 JSON 编写读取用的输入架构(schema)

时间:2018-07-19 04:56:38

标签: apache-spark pyspark pyspark-sql

我正尝试将一些 JSON 文件加载到 PySpark 中,并且只读取如下所示的特定列:

import { Component, OnInit } from '@angular/core';
import { Hero } from '../hero';
import { NgForm } from '@angular/forms'
import { UserService } from './user.service';

/**
 * Component registered under the `app-heroes` selector.
 *
 * Holds a `Hero` model, calls `UserService.registerUser()` when the component
 * initialises and reports the outcome through `alert`. `OnReset` clears the
 * model back to empty strings.
 */
@Component({
    selector: 'app-heroes',
    templateUrl: './heroes.component.html',
    styleUrls: ['./heroes.component.css']
})
export class HeroesComponent implements OnInit {

    /** Model object, pre-populated with sample values. */
    hero: Hero = {
        FirstName: 'Racj',
        LastName: 'xlkcj',
        Email: 'rd@gmail.com',
        Password: '103aa56',
        Phone: '90632512',
        CountryId: '1'
    };

    constructor(private userService: UserService) {
    }

    /**
     * Subscribes to `registerUser()` and shows the result in an alert.
     *
     * NOTE(review): `registerUser()` is called without arguments, so `hero`
     * is not sent from here — confirm against `UserService`.
     */
    ngOnInit(): void {
        this.userService.registerUser()
            .subscribe((data: any) => {
                // Loose comparison kept on purpose: the payload is untyped, so
                // the exact runtime type of `Succeeded` is not known here.
                if (data.Succeeded == true) {
                    alert('User registration successful');
                    return;
                }

                // Guard the lookup: a missing `Errors` used to throw a
                // TypeError, and an empty one used to alert "undefined".
                const firstError = data.Errors != null ? data.Errors[0] : undefined;
                if (firstError !== undefined) {
                    alert(firstError);
                } else {
                    alert('User registration failed');
                }
            });
    }

    /** Replaces `hero` with an object whose fields are all empty strings. */
    OnReset(): void {
        this.hero = {
            FirstName: '',
            LastName: '',
            Phone: '',
            Password: '',
            Email: '',
            CountryId: ''
        };
    }
}

所以我开始针对下面的主架构(schema)编写用于读取输入的架构:

# Attach the explicit schema to the reader first, then load the JSON input.
df = spark.read.schema(schema).json("sample/json/")

对于简单的字符串类型,我已经能写出对应的架构,但不知道如何为数组(array)和结构体(struct)类型编写:

 |-- test_name: string (nullable = true)
 |-- test_file: string (nullable = true)
 |-- test_id: string (nullable = true)
 |-- test_type: string (nullable = true)
 |-- test_url: string (nullable = true)
 |-- test_ids: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- value: struct (nullable = true)
 |    |-- ct: long (nullable = true)
 |    |-- dimmingSetting: long (nullable = true)
 |    |-- hue: double (nullable = true)
 |    |-- modeId: string (nullable = true)

如何扩展下面的架构,使其同时包含上面的 test_ids(array)和 value(struct)字段?
 # Flat schema holding only the string columns; every field uses the
 # StructField default for nullability.
 schema = StructType([
     StructField(column, StringType())
     for column in ('test_name', 'test_file', 'test_id', 'test_type', 'test_url')
 ])

1 个答案:

答案 0 :(得分:2)

扩展版本应为

from pyspark.sql.types import StructType, StructField, StringType, ArrayType, LongType, DoubleType
# Full read schema: the five string columns, then the array column and the
# nested struct column. Every field is declared nullable.
schema = StructType(
    [
        StructField(column, StringType(), True)
        for column in ('test_name', 'test_file', 'test_id', 'test_type', 'test_url')
    ]
    + [
        # Array of strings whose elements may be null.
        StructField('test_ids', ArrayType(StringType(), True), True),
        # Nested struct with its own typed fields.
        StructField(
            'value',
            StructType([
                StructField('ct', LongType(), True),
                StructField('dimmingSetting', LongType(), True),
                StructField('hue', DoubleType(), True),
                StructField('modeId', StringType(), True),
            ]),
            True,
        ),
    ]
)

希望这个回答对你有所帮助。