在此先感谢您的帮助。我有几千行,需要进行归一化,以便可以进行分析。 数据结构-
第1行-{'gender':'Male','nationality':'POL','document_type': 'national_identity_card','date_of_expiry':'2024-07-21', 'issuing_country':'POL'}
第2行-{'gender':'Female','nationality':'LTU','document_type': 'national_identity_card','date_of_expiry':'2023-06-27', 'issuing_country':'LTU'}
第3行{'document_type':'driving_licence','date_of_expiry': '2044-12-14','issuing_country':'GRC'}
第4行{'gender':'Male','document_type':'driving_licence', 'date_of_expiry':'2024-08-05','issuing_country':'GBR'}
我想要 (1)性别(gender)作为列名,Male或Female作为值 (2)国籍(nationality)作为列名 (3)document_type作为列名 (4)到期日(date_of_expiry)作为列名 (5)签发国家/地区(issuing_country)作为列名
请注意,每一行可能包含也可能不包含全部字段。
预先感谢您的帮助。
答案 0 :(得分:1)
我们假设您有几千行,如下所示:
{'gender': 'Male', 'nationality': 'POL', 'document_type': 'national_identity_card', 'date_of_expiry': '2024-07-21', 'issuing_country': 'POL'}
{'gender': 'Female', 'nationality': 'LTU', 'document_type': 'national_identity_card', 'date_of_expiry': '2023-06-27', 'issuing_country': 'LTU'}
{'document_type': 'driving_licence', 'date_of_expiry': '2044-12-14', 'issuing_country': 'GRC'}
{'gender': 'Male', 'document_type': 'driving_licence', 'date_of_expiry': '2024-08-05', 'issuing_country': 'GBR'}
这不是有效的常规JSON,不是有效的ndjson,也不是有效的javascript对象语法。
一种处理方法是通过将单引号替换为双引号将其转换为有效的ndjson:
# Approach 1: turn the single-quoted records into valid ndjson, then flatten them into a table.
# magrittr supplies the %>% pipe, ndjson supplies flatten(), stringi supplies stri_replace_all_regex().
library(magrittr)
library(ndjson)
library(stringi)
# Read the sample records as a character vector, one record per line.
readLines(textConnection("{'gender': 'Male', 'nationality': 'POL', 'document_type': 'national_identity_card', 'date_of_expiry': '2024-07-21', 'issuing_country': 'POL'}
{'gender': 'Female', 'nationality': 'LTU', 'document_type': 'national_identity_card', 'date_of_expiry': '2023-06-27', 'issuing_country': 'LTU'}
{'document_type': 'driving_licence', 'date_of_expiry': '2044-12-14', 'issuing_country': 'GRC'}
{'gender': 'Male', 'document_type': 'driving_licence', 'date_of_expiry': '2024-08-05', 'issuing_country': 'GBR'}")) %>%
stri_replace_all_regex("([^\\\\])'", '$1"') %>% # replace non-escaped single quotes with double quotes
ndjson::flatten("tbl") # turn the character vector of now-valid ndjson into a tibble; keys missing from a record show up as NA
## # A tibble: 4 x 5
## date_of_expiry document_type gender issuing_country nationality
## <chr> <chr> <chr> <chr> <chr>
## 1 2024-07-21 national_identity_ca… Male POL POL
## 2 2023-06-27 national_identity_ca… Female LTU LTU
## 3 2044-12-14 driving_licence NA GRC NA
## 4 2024-08-05 driving_licence Male GBR NA
另一种方法是使用V8包,因为在javascript领域中,我们可以使用eval(),无论出于何种原因,它对于指定javascript对象的方式都不太挑剔:
library(magrittr)
# Approach 2: let a JavaScript engine parse each record, then bind the results row-wise.
# data.table supplies rbindlist(); V8 supplies the embedded JavaScript context.
library(data.table)
library(V8)
# One JavaScript context, reused for every input line.
ctx <- v8()
# Read the sample records as a character vector, one record per line.
readLines(textConnection("{'gender': 'Male', 'nationality': 'POL', 'document_type': 'national_identity_card', 'date_of_expiry': '2024-07-21', 'issuing_country': 'POL'}
{'gender': 'Female', 'nationality': 'LTU', 'document_type': 'national_identity_card', 'date_of_expiry': '2023-06-27', 'issuing_country': 'LTU'}
{'document_type': 'driving_licence', 'date_of_expiry': '2044-12-14', 'issuing_country': 'GRC'}
{'gender': 'Male', 'document_type': 'driving_licence', 'date_of_expiry': '2024-08-05', 'issuing_country': 'GBR'}")) %>%
lapply(function(line) {
# Wrap the record in parentheses and eval() it so JavaScript parses it as an object literal.
# NOTE(review): eval() executes the line as JavaScript code, so only feed it trusted data;
# the line is pasted inside a double-quoted JS string, so an embedded double quote would break it.
ctx$eval(sprintf("var line = eval('(' + \"%s\" +')');", line))
# Pull the parsed object back into R.
ctx$get("line")
}) %>%
data.table::rbindlist(fill=TRUE) %>% # fill=TRUE: keys missing from a record become NA
as.data.frame()
## gender nationality document_type date_of_expiry issuing_country
## 1 Male POL national_identity_card 2024-07-21 POL
## 2 Female LTU national_identity_card 2023-06-27 LTU
## 3 <NA> <NA> driving_licence 2044-12-14 GRC
## 4 Male <NA> driving_licence 2024-08-05 GBR
对于指定javascript对象的方式都不太挑剔:
//
// GameScene.swift
// Flarrow
//
// Created by Денис Андрейчук on 11/9/18.
// Copyright © 2018 Денис Андрейчук. All rights reserved.
//
import SpriteKit
/// Scene that draws a single red polyline which grows a little every frame.
///
/// The line starts by growing straight up. Touching the right half of the
/// scene (x > 0) makes the tip grow up-and-right; touching anywhere else makes
/// it grow up-and-left. Every touch starts a new segment at the current tip.
class GameScene: SKScene {
    /// Direction in which the tip of the line is currently growing.
    enum State {
        case moveUp
        case moveLeft
        case moveRight
    }

    /// Current growth direction of the line's tip.
    var currentState = State.moveUp
    /// Vertices of the polyline; the last element is the moving tip.
    var pathArray = [CGPoint]()
    /// Shape node that renders the polyline.
    let line = SKShapeNode()

    /// Sets up the background, seeds the line with two coincident points
    /// (a fixed start and a moving tip) and adds the line node to the scene.
    override func didMove(to view: SKView) {
        backgroundColor = .gray
        currentState = .moveUp

        let start = CGPoint(x: 0, y: -UIScreen.main.bounds.height)
        pathArray.append(start)
        pathArray.append(start)

        createLine()
    }

    /// Picks the new growth direction from the touch position and starts a new
    /// segment by duplicating the current tip.
    override func touchesBegan(_ touches: Set<UITouch>, with event: UIEvent?) {
        // Bail out instead of crashing when there is no touch or no line yet.
        guard let touch = touches.first, let tip = pathArray.last else { return }

        currentState = touch.location(in: self).x > 0 ? .moveRight : .moveLeft
        pathArray.append(tip)
    }

    /// Configures the line node's appearance and adds it to the scene.
    func createLine() {
        line.path = makePath()
        line.fillColor = .clear
        line.lineWidth = 5
        line.strokeColor = .red

        // Adding a node that already has a parent raises an exception.
        if line.parent == nil {
            addChild(line)
        }
    }

    /// Advances the tip by one point per frame in the current direction and
    /// rebuilds the rendered path.
    override func update(_ currentTime: TimeInterval) {
        // Nothing to move until the path has been seeded.
        guard let tipIndex = pathArray.indices.last else { return }

        switch currentState {
        case .moveUp:
            // While moving up, only the second of the two seed points moves.
            if pathArray.indices.contains(1) {
                pathArray[1].y += 1
            }
        case .moveLeft:
            pathArray[tipIndex].y += 1
            pathArray[tipIndex].x -= 1
        case .moveRight:
            pathArray[tipIndex].y += 1
            pathArray[tipIndex].x += 1
        }

        line.path = makePath()
    }

    /// Builds a path through all points in `pathArray`, starting at the first
    /// one. Returns an empty path when there are no points.
    private func makePath() -> CGPath {
        let path = CGMutablePath()
        path.addLines(between: pathArray)
        return path
    }
}
由于我们对您的数据一无所知,因此两者都是脆弱的操作。即是否有可能将单引号转义?是否有嵌入双引号的机会?这是您的数据使用的格式吗?
请注意,V8很难在某些系统上运行,但是如果无法在您的系统上安装V8,请不要对此发表评论。