我正在使用Scala为Binary中的大输入文件编写外部合并排序。 我使用gensort生成输入并使用来自此网站的valsort评估输出:http://www.ordinal.com/gensort.html
我将一次读取100个字节,Key(List[Byte])
的前10个字节,Value(List[Byte])
的其余90个字节
排序后,我的输出由valsort评估,这是错误的。
但是当我在ASCII中使用输入时,我的输出是正确的。
所以我想知道如何以正确的方式对二进制输入进行排序?
Valsort说我的第一个无序记录是56,这是我打印出来的:
50 --> Key(List(-128, -16, 5, -10, -83, 23, -107, -109, 42, -11))
51 --> Key(List(-128, -16, 5, -10, -83, 23, -107, -109, 42, -11))
52 --> Key(List(-128, -10, -10, 68, -94, 37, -103, 30, 90, 16))
53 --> Key(List(-128, -10, -10, 68, -94, 37, -103, 30, 90, 16))
54 --> Key(List(-128, -10, -10, 68, -94, 37, -103, 30, 90, 16))
55 --> Key(List(-128, -10, -10, 68, -94, 37, -103, 30, 90, 16))
56 --> Key(List(-128, 0, -27, -4, -82, -82, 121, -125, -22, 99))
57 --> Key(List(-128, 0, -27, -4, -82, -82, 121, -125, -22, 99))
58 --> Key(List(-128, 0, -27, -4, -82, -82, 121, -125, -22, 99))
59 --> Key(List(-128, 0, -27, -4, -82, -82, 121, -125, -22, 99))
60 --> Key(List(-128, 7, -65, 118, 121, -12, 48, 50, 59, -8))
61 --> Key(List(-128, 7, -65, 118, 121, -12, 48, 50, 59, -8))
62 --> Key(List(-128, 7, -65, 118, 121, -12, 48, 50, 59, -8))
这是我的外部排序代码:
package externalsorting
import java.io.{BufferedOutputStream, File, FileOutputStream}
import java.nio.channels.FileChannel
import java.util.Calendar
import scala.collection.mutable
import readInput._
import scala.collection.mutable.ListBuffer
/**
* Created by hminle on 12/5/2016.
*/
object ExternalSortingExample extends App{
val dir: String = "C:\\ShareUbuntu\\testMerge"
val listFile: List[File] = Utils.getListOfFiles(dir)
listFile foreach(x => println(x.getName))
var fileChannelsInput: List[(FileChannel, Boolean)] = listFile.map{input => (Utils.getFileChannelFromInput(input), false)}
val tempDir: String = dir + "/tmp/"
val tempDirFile: File = new File(tempDir)
val isSuccessful: Boolean = tempDirFile.mkdir()
if(isSuccessful) println("Create temp dir successfully")
else println("Create temp dir failed")
var fileNameCounter: Int = 0
val chunkSize = 100000
// Split big input files into small chunks
while(!fileChannelsInput.isEmpty){
if(Utils.estimateAvailableMemory() > 400000){
val fileChannel = fileChannelsInput(0)._1
val (chunks, isEndOfFileChannel) = Utils.getChunkKeyAndValueBySize(chunkSize, fileChannel)
if(isEndOfFileChannel){
fileChannel.close()
fileChannelsInput = fileChannelsInput.drop(1)
} else {
val sortedChunk: List[(Key, Value)] = Utils.getSortedChunk(chunks)
val fileName: String = tempDir + "partition-" + fileNameCounter
Utils.writePartition(fileName, sortedChunk)
fileNameCounter += 1
}
} else {
println(Thread.currentThread().getName +"There is not enough available free memory to continue processing" + Utils.estimateAvailableMemory())
}
}
val listTempFile: List[File] = Utils.getListOfFiles(tempDir)
val start = Calendar.getInstance().getTime
val tempFileChannels: List[FileChannel] = listTempFile.map(Utils.getFileChannelFromInput(_))
val binaryFileBuffers: List[BinaryFileBuffer] = tempFileChannels.map(BinaryFileBuffer(_))
binaryFileBuffers foreach(x => println(x.toString))
val pq1: ListBuffer[BinaryFileBuffer] = ListBuffer.empty
binaryFileBuffers.filter(!_.isEmpty()).foreach(pq1.append(_))
val outputDir: String = dir + "/mergedOutput"
val bos = new BufferedOutputStream(new FileOutputStream(outputDir))
// Start merging temporary files
while(pq1.length > 0){
val pq2 = pq1.toList.sortWith(_.head()._1 < _.head()._1)
val buffer: BinaryFileBuffer = pq2.head
val keyVal: (Key, Value) = buffer.pop()
val byteArray: Array[Byte] = Utils.flattenKeyValue(keyVal).toArray[Byte]
Stream.continually(bos.write(byteArray))
if(buffer.isEmpty()){
buffer.close()
pq1 -= buffer
}
count+=1
}
bos.close()
}
这是BinaryFileBuffer.scala - &gt;这只是一个包装
package externalsorting
import java.nio.channels.FileChannel
import readInput._
/**
* Created by hminle on 12/5/2016.
*/
object BinaryFileBuffer{
def apply(fileChannel: FileChannel): BinaryFileBuffer = {
val buffer: BinaryFileBuffer = new BinaryFileBuffer(fileChannel)
buffer.reload()
buffer
}
}
class BinaryFileBuffer(fileChannel: FileChannel) extends Ordered[BinaryFileBuffer] {
private var cache: Option[(Key, Value)] = _
def isEmpty(): Boolean = cache == None
def head(): (Key, Value) = cache.get
def pop(): (Key, Value) = {
val answer = head()
reload()
answer
}
def reload(): Unit = {
this.cache = Utils.get100BytesKeyAndValue(fileChannel)
}
def close(): Unit = fileChannel.close()
def compare(that: BinaryFileBuffer): Int = {
this.head()._1.compare(that.head()._1)
}
}
这是我的Utils.scala:
package externalsorting
import java.io.{BufferedOutputStream, File, FileOutputStream}
import java.nio.ByteBuffer
import java.nio.channels.FileChannel
import java.nio.file.Paths
import readInput._
import scala.annotation.tailrec
import scala.collection.mutable.ListBuffer
/**
* Created by hminle on 12/5/2016.
*/
object Utils {
def getListOfFiles(dir: String): List[File] = {
val d = new File(dir)
if(d.exists() && d.isDirectory){
d.listFiles.filter(_.isFile).toList
} else List[File]()
}
def get100BytesKeyAndValue(fileChannel: FileChannel): Option[(Key, Value)] = {
val size = 100
val buffer = ByteBuffer.allocate(size)
buffer.clear()
val numOfByteRead = fileChannel.read(buffer)
buffer.flip()
if(numOfByteRead != -1){
val data: Array[Byte] = new Array[Byte](numOfByteRead)
buffer.get(data, 0, numOfByteRead)
val (key, value) = data.splitAt(10)
Some(Key(key.toList), Value(value.toList))
} else {
None
}
}
def getFileChannelFromInput(file: File): FileChannel = {
val fileChannel: FileChannel = FileChannel.open(Paths.get(file.getPath))
fileChannel
}
def estimateAvailableMemory(): Long = {
System.gc()
val runtime: Runtime = Runtime.getRuntime
val allocatedMemory: Long = runtime.totalMemory() - runtime.freeMemory()
val presFreeMemory: Long = runtime.maxMemory() - allocatedMemory
presFreeMemory
}
def writePartition(dir: String, keyValue: List[(Key, Value)]): Unit = {
val byteArray: Array[Byte] = flattenKeyValueList(keyValue).toArray[Byte]
val bos = new BufferedOutputStream(new FileOutputStream(dir))
Stream.continually(bos.write(byteArray))
bos.close()
}
def flattenKeyValueList(keyValue: List[(Key,Value)]): List[Byte] = {
keyValue flatten {
case (Key(keys), Value(values)) => keys:::values
}
}
def flattenKeyValue(keyVal: (Key, Value)): List[Byte] = {
keyVal._1.keys:::keyVal._2.values
}
def getChunkKeyAndValueBySize(size: Int, fileChannel: FileChannel): (List[(Key, Value)], Boolean) = {
val oneKeyValueSize = 100
val countMax = size / oneKeyValueSize
var isEndOfFileChannel: Boolean = false
var count = 0
val chunks: ListBuffer[(Key, Value)] = ListBuffer.empty
do{
val keyValue = get100BytesKeyAndValue(fileChannel)
if(keyValue.isDefined) chunks.append(keyValue.get)
isEndOfFileChannel = !keyValue.isDefined
count += 1
}while(!isEndOfFileChannel && count < countMax)
(chunks.toList, isEndOfFileChannel)
}
def getSortedChunk(oneChunk: List[(Key, Value)]): List[(Key, Value)] = {
oneChunk.sortWith((_._1 < _._1))
}
}
我如何定义键和值:
case class Key(keys: List[Byte]) extends Ordered[Key] {
def isEmpty(): Boolean = keys.isEmpty
def compare(that: Key): Int = {
compare_aux(this.keys, that.keys)
}
private def compare_aux(keys1: List[Byte], keys2: List[Byte]): Int = {
(keys1, keys2) match {
case (Nil, Nil) => 0
case (list, Nil) => 1
case (Nil, list) => -1
case (hd1::tl1, hd2::tl2) => {
if(hd1 > hd2) 1
else if(hd1 < hd2) -1
else compare_aux(tl1, tl2)
}
}
}
}
case class Value(values: List[Byte])
答案 0 :(得分:0)
我找到了答案。从二进制和ASCII读取是不同的。
In what order should the sorted file be?
For binary records (GraySort or MinuteSort), the 10-byte keys should be ordered as arrays of unsigned bytes. The memcmp() library routine can be used for this purpose.
为了对二进制进行排序,我需要将有符号字节转换为无符号字节。