我有二进制文件,其中包含地点和坐标(纬度,经度)的名称,每当我使用编码String
将其解析为.ascii
时,它都不会很好地解析它。我假设从Float
值(坐标)解析失败。
阅读InputStream
extension Data {
init(reading input: InputStream) {
self.init()
input.open()
let bufferSize = 1024
let buffer = UnsafeMutablePointer<UInt8>.allocate(capacity: bufferSize)
while input.hasBytesAvailable {
let read = input.read(buffer, maxLength: bufferSize)
self.append(buffer, count: read)
}
buffer.deallocate()
input.close()
}
}
let filepath = Bundle.main.path(forResource: "MN", ofType: "dat")
let data = Data.init(reading: InputStream(fileAtPath: filepath)!)
let parsedData = String.init(data: data, encoding: .ascii)
任何想法我怎么能以正确的方式解析它?
例如,Java ObjectInputStream
具有名为:
inputStreamObj.readUTF()
inputStreamObj.readFloat()
爪哇
答案 0 :(得分:2)
正如我在评论中所写,您需要阅读规范Object Serialization Stream Protocol。
因此,前4个字节表示STREAM_MAGIC,STREAM_VERSION,预计始终为相同的值。 5字节序列0x7A 0xhh 0xhh 0xhh 0xhh表示TC_BLOCKDATALONG(0xhhhhhhhh)。
在解析字符串和浮点数之前,需要连接所有块。
所以,准备DataReader
:
(与Sulthan's几乎相同,但这正确地对待了修改后的UTF-8。)
struct DataReader {
enum DataReaderError: Error {
case invalidFirstByte(byte: UInt16, offset: Int)
case invalidFollowingByte
case missingFollowingByte
case insufficientData
}
var data: Data
var currentPosition: Int
init(data: Data) {
self.data = data
self.currentPosition = 0
}
mutating func skipBytes(_ n: Int) {
currentPosition += n
}
private mutating func readBigEndian<T: FixedWidthInteger>() throws -> T {
guard currentPosition + MemoryLayout<T>.size <= data.count else {
throw DataReaderError.insufficientData
}
var fixedWithInteger: T = 0
let range: Range<Int> = currentPosition ..< currentPosition + MemoryLayout<T>.size
withUnsafeMutableBytes(of: &fixedWithInteger) {ptrT in
let uint8Ptr = ptrT.baseAddress!.assumingMemoryBound(to: UInt8.self)
data.copyBytes(to: uint8Ptr, from: range)
}
currentPosition += MemoryLayout<T>.size
return fixedWithInteger.bigEndian
}
mutating func readFloat() throws -> Float {
let floatBits: UInt32 = try readBigEndian()
return Float(bitPattern: floatBits)
}
mutating func readUnsignedShort() throws -> Int {
let ushortValue: UInt16 = try readBigEndian()
return Int(ushortValue)
}
mutating func readInt() throws -> Int {
let intValue: Int32 = try readBigEndian()
return Int(intValue)
}
mutating func readUnsignedByte() throws -> Int {
guard currentPosition < data.count else {
throw DataReaderError.insufficientData
}
let byte = data[currentPosition]
currentPosition += 1
return Int(byte)
}
mutating func readBytes(_ n: Int) throws -> Data {
guard currentPosition + n <= data.count else {
throw DataReaderError.insufficientData
}
let subdata = data[currentPosition ..< currentPosition+n]
currentPosition += n
return subdata
}
mutating func readUTF() throws -> String {
//Get byte size of the string
let count = try readUnsignedShort()
//Decoding Modified UTF-8
var utf16: [UInt16] = []
var offset = 0
while offset < count {
let firstByte = UInt16(data[currentPosition + offset])
if firstByte & 0b1_0000000 == 0b0_0000000 {
utf16.append(firstByte)
offset += 1
} else if firstByte & 0b111_00000 == 0b110_00000 {
guard offset + 1 < count else {throw DataReaderError.missingFollowingByte}
let secondByte = UInt16(data[currentPosition + offset + 1])
guard secondByte & 0b11_000000 == 0b10_000000 else {throw DataReaderError.invalidFollowingByte}
let codeUnit = ((firstByte & 0b000_11111) << 6) | (secondByte & 0b00_111111)
utf16.append(codeUnit)
offset += 2
} else if firstByte & 0b1111_0000 == 0b1110_0000 {
guard offset + 2 < count else {throw DataReaderError.missingFollowingByte}
let secondByte = UInt16(data[currentPosition + offset + 1])
guard secondByte & 0b11_000000 == 0b10_000000 else {throw DataReaderError.invalidFollowingByte}
let thirdByte = UInt16(data[currentPosition + offset + 2])
guard thirdByte & 0b11_000000 == 0b10_000000 else {throw DataReaderError.invalidFollowingByte}
let codeUnit = ((firstByte & 0b0000_1111) << 12) | ((secondByte & 0b00_111111) << 6) | (thirdByte & 0b00_111111)
utf16.append(codeUnit)
offset += 3
} else {
throw DataReaderError.invalidFirstByte(byte: firstByte, offset: currentPosition+offset)
}
}
currentPosition += offset
return String(utf16CodeUnits: &utf16, count: utf16.count)
}
var isAtEnd: Bool {
return currentPosition == data.count
}
}
我们可以按如下方式解析您的MN.dat
:
let mnUrl = Bundle.main.url(forResource: "MN", withExtension: "dat")!
do {
let data = try Data(contentsOf: mnUrl)
var reader = DataReader(data: data)
reader.skipBytes(4)
//First collect all blocks
var blockData = Data()
while !reader.isAtEnd {
let contentType = try reader.readUnsignedByte()
if contentType == 0x7A {//TC_BLOCKDATALONG
let size = try reader.readInt()
let block = try reader.readBytes(size)
blockData.append(block)
} else if contentType == 0x77 {//TC_BLOCKDATA
let size = try reader.readUnsignedByte()
let block = try reader.readBytes(size)
blockData.append(block)
} else {
print("Unsupported content type")
break
}
}
//Then read the contents of blockData
var blockReader = DataReader(data: blockData)
while !blockReader.isAtEnd {
let string = try blockReader.readUTF()
print(string)
let float1 = try blockReader.readFloat()
print(float1)
let float2 = try blockReader.readFloat()
print(float2)
//Use string, float1, float2 as you like
}
} catch {
print(error)
}
输出:
Albert Lea
43.648
-93.3683
Albertville
45.2377
-93.6544
Alexandria
45.8852
-95.3775
(... no errors...)
Woodbury
44.9239
-92.9594
Worthington
43.62
-95.5964
Wyoming
45.3364
-92.9972
Zimmerman
45.4433
-93.59
如果您的二进制数据可能包含其他内容类型,则可能需要修改上述代码。
答案 1 :(得分:1)
我将向您展示如何解析Java编码的数据。但是,由于我无法理解文件的格式,因此响应将不完整:
首先,加载文件:
// load the file
let fileUrl = URL(fileURLWithPath: "/Users/sulthan/Downloads/MN.dat")
let data = try! Data(contentsOf: fileUrl)
其次,创建一个简单的Java数据读取器:
// create a simple data reader
class Reader {
let data: Data
private var offset = 0
init(data: Data) {
self.data = data
}
var hasMoreData: Bool {
return offset < data.count
}
func skip(length: Int) {
offset += length
}
func readByte() -> UInt8 {
defer { offset += 1}
return data[offset]
}
// java bytes are unsigned
func readJavaByte() -> Int8 {
return Int8(bitPattern: readByte())
}
func readBytes(length: Int) -> Data {
defer { offset += length }
return data.subdata(in: offset ..< offset + length)
}
private func readJavaUShort() -> UInt16 {
let byte1 = UInt16(exactly: readByte())!
let byte2 = UInt16(exactly: readByte())!
return (byte1 << 8) | byte2
}
func readJavaShort() -> Int16 {
return Int16(bitPattern: readJavaUShort())
}
// Java UTF-8 encodes the length as first two bytes (unsigned java short)
func readJavaUtf() -> String? {
let length = readJavaUShort()
let data = readBytes(length: Int(length))
return String(data: data, encoding: .utf8)
}
private func readUInt32() -> UInt32 {
let short1 = UInt32(exactly: readJavaUShort())!
let short2 = UInt32(exactly: readJavaUShort())!
return (short1 << 16) | short2
}
func readJavaInt() -> Int32 {
let short1 = Int32(exactly: readJavaShort())!
let short2 = Int32(exactly: readJavaShort())!
return (short1 << 16) | short2
}
// interpret the 4 bytes as a floating point number
func readJavaFloat() -> Float {
let bits = readUInt32()
return Float(bitPattern: bits)
}
}
第三,解析数据。由于数据格式未知,我不能完全做到这一点:
// create a reader from our data
let reader = Reader(data: data)
// some data I don't understand
reader.skip(length: 4)
var offset = 0
while reader.hasMoreData {
// some data I don't understand in the beginning and after every 52 items
if offset % 53 == 0 {
reader.skip(length: 5)
}
print(reader.readJavaUtf())
print(reader.readJavaFloat())
print(reader.readJavaFloat())
offset += 1
}
在解析某些项目后,数据解析将与提供的数据一起崩溃。我假设您知道如何处理,因为您知道格式。