问题:我需要编写一个应用程序来处理几百个文件,每个文件都需要几百兆字节的内存才能处理完成。我使用通过 `Executors.newFixedThreadPool()` 创建的 `Future[Report]` 对象编写了它,但由于 `ExecutorService.invokeAll()` 返回的 `List[Future[Report]]` 对象一直持有每个处理过程使用的中间内存,导致了内存不足错误。我的解决办法是:在计算出 `Report` 值(每个 `Report` 只有几百行)之后,从处理器中的本地方法返回 `Report` 对象,而不是在(来自 `Callable` 接口的)`call` 方法中进行计算。
我想尝试使用Scala Actors来解决这个问题。我创建了一个类,它接受一系列作业(作业、结果和处理函数均为参数化类型),并在数量可配置的 `Worker` 实例(`Actor` 的子类)中逐个进行处理。代码如下。
问题:
我不确定我的处理方式是否正确。
我不喜欢使用 `CountDownLatch` 来延迟从调度程序返回结果。
我更愿意编写一个更“函数式”的调度程序版本,它不会修改 `jobsQueue` 列表或 `workers` 哈希映射,也许可以借用来自Clojure的尾递归 `loop` 结构(我在其他Scala代码中使用过 `@tailrec def loop` 方法)。
我焦急地等待Philipp Haller和Frank Sommers发表"Actors in Scala"。
以下是代码:
@tailrec def loop
答案 0 :(得分:4)
快速浏览后,我建议进行以下更新:
// Channel on which the dispatcher publishes the finished result list;
// it takes the place of a countdown latch for handing results to the caller.
val resultsChannel: Channel[List[B]] = new Channel[List[B]]
/**
 * Dispatcher actor: hands one job to each worker that asks for work and
 * collects the reported results. The job queue, worker map and result list
 * are immutable values threaded through `loop`.
 */
val dispatcher = new Actor {
  // Start from the full job list (not an empty queue) and create exactly
  // `actorCount` workers, ids 0 until actorCount.
  def act(): Unit = loop(jobs, startWorkers(), Nil)

  // Creates and starts the workers, keyed by id, without a downcast.
  private def startWorkers(): Map[Int, Worker] =
    (0 until actorCount).map { id =>
      val worker = new Worker(id)
      worker.start()
      id -> worker
    }.toMap

  // Not annotated @tailrec: the recursive calls sit inside the react handler
  // (a closure), so they are not self tail calls of this method.
  def loop(jobQueue: List[A],
           workers: Map[Int, Worker],
           res: List[B]): Unit = react {
    case ReportResult(id, result) =>
      val results = result :: res
      // Once every job has reported, publish the results on the output channel.
      if (results.size == jobs.size) {
        resultsChannel ! results
      }
      loop(jobQueue, workers, results)
    case SendJob(id) =>
      if (!jobQueue.isEmpty) {
        workers(id) ! Process(jobQueue.head)
        loop(jobQueue.tail, workers, res)
      } else {
        // Keep receiving: results for jobs still in flight must be collected.
        // NOTE(review): the idle worker is not told to stop here -- confirm
        // whether a stop message should be sent to it.
        loop(jobQueue, workers, res)
      }
    case Stopped(id) =>
      loop(jobQueue, workers - id, res)
  }
}
dispatcher.start()
/**
 * Blocks until a message arrives on `resultsChannel` and returns it.
 *
 * @return the result list received from the channel
 */
def results: List[B] =
  resultsChannel.receive {
    // Any message on the channel is the answer; hand it straight back.
    case collected => collected
  }
答案 1 :(得分:0)
这是我提出的最终版本(感谢Vasil Remeniuk)。标有 `// DEBUG` 注释的 `println` 语句用于显示进度,`main` 方法是单元测试:
import scala.actors.Actor
import scala.actors.Channel
import scala.actors.Scheduler
import scala.annotation.tailrec
/**
 * Companion of the MultiWorker class: holds the constants the class imports
 * for its memory report, plus a `main` that exercises the class.
 */
object MultiWorker {
  // Bytes per megabyte, used to scale the memory figures.
  private val megabyte = 1024 * 1024
  // The JVM runtime the memory figures are read from.
  private val runtime = Runtime.getRuntime

  /**
   * Pushes the jobs 0 to 4 through two workers, prints the collected results
   * and shuts the actor scheduler down.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val jobs: List[Int] = List.range(0, 5)
    // Each job pauses briefly, prints its value and returns it unchanged.
    val echo: Int => Int = { value =>
      Thread.sleep(100)
      println(value)
      value
    }
    val multiWorker = new MultiWorker[Int, Int](jobs, 2, echo)
    println("multiWorker.results: " + multiWorker.results)
    Scheduler.shutdown
  }
}
/**
 * Processes a list of jobs with a fixed number of worker actors coordinated
 * by a dispatcher actor.
 *
 * Results are prepended to the accumulator as they are reported, so the list
 * returned by `results` follows reporting order (latest first), not job order.
 *
 * @tparam A the job type
 * @tparam B the result type
 * @param jobs       the jobs to process
 * @param actorCount the number of worker actors to start
 * @param process    the function applied to each job to produce its result
 */
class MultiWorker[A, B](jobs: List[A],
                        actorCount: Int,
                        process: (A) => B) {
  import MultiWorker._

  sealed abstract class Message
  // Dispatcher -> Worker: Run this job and report results
  case class Process(job: A) extends Message
  // Worker -> Dispatcher: Result of processing
  case class ReportResult(id: Int, result: B) extends Message
  // Worker -> Dispatcher: I need work -- send me a job
  case class SendJob(id: Int) extends Message
  // Worker -> Dispatcher: I have stopped as requested
  case class Stopped(id: Int) extends Message
  // Dispatcher -> Worker: Stop working -- all jobs done
  case class StopWorking() extends Message

  /**
   * A simple logger that can be sent text messages that will be written to the
   * console. Used so that messages from the actors do not step on each other.
   */
  object Logger extends Actor {
    def act(): Unit = {
      loop {
        react {
          case text: String => println(text)
          // Constructor pattern: a bare `StopWorking` would be compared with
          // the companion object and never match a sent StopWorking() message.
          case StopWorking() => exit()
        }
      }
    }
  }
  Logger.start()

  /**
   * A worker actor that will process jobs and return results to the
   * dispatcher.
   *
   * @param id the key under which the dispatcher tracks this worker
   */
  case class Worker(id: Int) extends Actor {
    def act(): Unit = {
      // Ask the dispatcher for an initial job
      dispatcher ! SendJob(id)
      loop {
        react {
          case Process(job) =>
            println("Worker(" + id + "): " + Process(job)) // DEBUG
            val startTime = System.nanoTime
            dispatcher ! ReportResult(id, process(job))
            val endTime = System.nanoTime
            val totalMemory = (runtime.totalMemory / megabyte)
            val usedMemory = totalMemory - (runtime.freeMemory / megabyte)
            val message = "Finished job " + job + " in " +
              ((endTime - startTime) / 1000000000.0) +
              " seconds using " + usedMemory +
              "MB out of total " + totalMemory + "MB"
            Logger ! message
            // Ask for the next job now that this one has been reported.
            dispatcher ! SendJob(id)
          case StopWorking() =>
            println("Worker(" + id + "): " + StopWorking()) // DEBUG
            dispatcher ! Stopped(id)
            exit()
        }
      }
    }
  }

  val resultsChannel = new Channel[List[B]]

  /**
   * The job dispatcher that sends jobs to the workers until the job queue
   * is empty. It then tells the workers to stop working and, once every
   * worker has reported that it stopped, sends the List[B] results to
   * `resultsChannel`.
   */
  val dispatcher: Actor = new Actor {
    def act(): Unit = {
      // Not annotated @tailrec: the recursive calls sit inside the react
      // handler (a closure), so they are not self tail calls of this method.
      def loop(jobs: List[A],
               workers: Map[Int, Worker],
               acc: List[B]): Unit = {
        println("dispatcher: loop: jobs: " + jobs + ", workers: " + workers + ", acc: " + acc) // DEBUG
        if (!workers.isEmpty) { // Stop recursion when there are no more workers
          react {
            case ReportResult(id, result) =>
              println("dispatcher: " + ReportResult(id, result)) // DEBUG
              loop(jobs, workers, result :: acc)
            case SendJob(id) =>
              println("dispatcher: " + SendJob(id)) // DEBUG
              if (!jobs.isEmpty) {
                println("dispatcher: " + "Sending: " + Process(jobs.head) + " to " + id) // DEBUG
                workers(id) ! Process(jobs.head)
                loop(jobs.tail, workers, acc)
              } else {
                println("dispatcher: " + "Sending: " + StopWorking() + " to " + id) // DEBUG
                workers(id) ! StopWorking()
                loop(Nil, workers, acc)
              }
            case Stopped(id) =>
              println("dispatcher: " + Stopped(id)) // DEBUG
              loop(jobs, workers - id, acc)
          }
        } else {
          println("dispatcher: " + "jobs: " + jobs + ", workers: " + workers + ", acc: " + acc) // DEBUG
          resultsChannel ! acc
        }
      }

      // Build and start the workers without downcasting the value of start.
      val startedWorkers: Map[Int, Worker] = (0 until actorCount).map { id =>
        val worker = new Worker(id)
        worker.start()
        id -> worker
      }.toMap
      loop(jobs, startedWorkers, Nil)
      exit()
    }
  }
  // Start only after `dispatcher` has been assigned: the workers created in
  // act() immediately send to `dispatcher`, which must not still be null.
  dispatcher.start()

  /**
   * Get the results of the processing -- wait for the dispatcher to finish
   * before returning.
   *
   * @return the accumulated results received from `resultsChannel`
   */
  def results: List[B] = {
    resultsChannel.receive {
      case results => results
    }
  }
}