我使用Databricks本身的以下代码来说明如何在Scala https://docs.databricks.com/notebooks/notebook-workflows.html#run-multiple-notebooks-concurrently中并行运行其笔记本。我尝试添加重试功能,如果序列中的笔记本之一失败,它将根据我传递给它的重试值重试该笔记本。
这是Databricks的并行笔记本代码:
//parallel notebook code
import scala.concurrent.{Future, Await}
import scala.concurrent.duration._
import scala.util.control.NonFatal
case class NotebookData(path: String, timeout: Int, parameters: Map[String, String] = Map.empty[String, String])
def parallelNotebooks(notebooks: Seq[NotebookData]): Future[Seq[String]] = {
import scala.concurrent.{Future, blocking, Await}
import java.util.concurrent.Executors
import scala.concurrent.ExecutionContext
import com.databricks.WorkflowException
val numNotebooksInParallel = 5
// If you create too many notebooks in parallel the driver may crash when you submit all of the jobs at once.
// This code limits the number of parallel notebooks.
implicit val ec = ExecutionContext.fromExecutor(Executors.newFixedThreadPool(numNotebooksInParallel))
val ctx = dbutils.notebook.getContext()
Future.sequence(
notebooks.map { notebook =>
Future {
dbutils.notebook.setContext(ctx)
if (notebook.parameters.nonEmpty)
dbutils.notebook.run(notebook.path, notebook.timeout, notebook.parameters)
else
dbutils.notebook.run(notebook.path, notebook.timeout)
}
.recover {
case NonFatal(e) => s"ERROR: ${e.getMessage}"
}
}
)
}
这是我如何调用以上代码以运行多个示例笔记本的示例:
import scala.concurrent.Await
import scala.concurrent.duration._
import scala.language.postfixOps
val notebooks = Seq(
NotebookData("Notebook1", 0, Map("client"->client)),
NotebookData("Notebook2", 0, Map("client"->client))
)
val res = parallelNotebooks(notebooks)
Await.result(res, 3000000 seconds) // this is a blocking call.
res.value
答案 0 :(得分:2)
这是一次尝试。由于您的代码无法编译,因此我插入了一些虚拟类。
此外,您没有完全指定所需的行为,因此我做了一些假设。每个连接仅重试5次。如果五次重试后任何期货仍然失败,则整个期货都将失败。这两种行为都可以更改,但是由于您未指定,因此我不确定您想要的是什么。
如果您有任何疑问或希望我对该程序进行更改,请在评论部分告诉我。
object TestNotebookData extends App{
//parallel notebook code
import scala.concurrent.{Future, Await}
import scala.concurrent.duration._
import scala.util.control.NonFatal
case class NotebookData(path: String, timeout: Int, parameters: Map[String, String] = Map.empty[String, String])
case class Context()
case class Notebook(){
def getContext(): Context = Context()
def setContext(ctx: Context): Unit = ()
def run(path: String, timeout: Int, paramters: Map[String, String] = Map()): Seq[String] = Seq()
}
case class Dbutils(notebook: Notebook)
val dbutils = Dbutils(Notebook())
def parallelNotebooks(notebooks: Seq[NotebookData]): Future[Seq[Seq[String]]] = {
import scala.concurrent.{Future, blocking, Await}
import java.util.concurrent.Executors
import scala.concurrent.ExecutionContext
// This code limits the number of parallel notebooks.
implicit val ec = ExecutionContext.fromExecutor(Executors.newFixedThreadPool(numNotebooksInParallel))
val ctx = dbutils.notebook.getContext()
val isRetryable = true
val retries = 5
def runNotebook(notebook: NotebookData): Future[Seq[String]] = {
def retryWrapper(retry: Boolean, current: Int, max: Int): Future[Seq[String]] = {
val fut = Future {runNotebookInner}
if (retry && current < max) fut.recoverWith{ _ => retryWrapper(retry, current + 1, max)}
else fut
}
def runNotebookInner() = {
dbutils.notebook.setContext(ctx)
if (notebook.parameters.nonEmpty)
dbutils.notebook.run(notebook.path, notebook.timeout, notebook.parameters)
else
dbutils.notebook.run(notebook.path, notebook.timeout)
}
retryWrapper(isRetryable, 0, retries)
}
Future.sequence(
notebooks.map { notebook =>
runNotebook(notebook)
}
)
}
val notebooks = Seq(
NotebookData("Notebook1", 0, Map("client"->"client")),
NotebookData("Notebook2", 0, Map("client"->"client"))
)
val res = parallelNotebooks(notebooks)
Await.result(res, 3000000 seconds) // this is a blocking call.
res.value
}
答案 1 :(得分:0)
我发现这可行:
import scala.util.{Try, Success, Failure}
def tryNotebookRun (path: String, timeout: Int, parameters: Map[String, String] = Map.empty[String, String]): Try[Any] = {
Try(
if (parameters.nonEmpty){
dbutils.notebook.run(path, timeout, parameters)
}
else{
dbutils.notebook.run(path, timeout)
}
)
}
//parallel notebook code
import scala.concurrent.{Future, Await}
import scala.concurrent.duration._
import scala.util.control.NonFatal
def runWithRetry(path: String, timeout: Int, parameters: Map[String, String] = Map.empty[String, String], maxRetries: Int = 2) = {
var numRetries = 0
while (numRetries < maxRetries){
tryNotebookRun(path, timeout, parameters) match {
case Success(_) => numRetries = maxRetries
case Failure(_) => numRetries = numRetries + 1
}
}
}
case class NotebookData(path: String, timeout: Int, parameters: Map[String, String] = Map.empty[String, String])
def parallelNotebooks(notebooks: Seq[NotebookData]): Future[Seq[Any]] = {
import scala.concurrent.{Future, blocking, Await}
import java.util.concurrent.Executors
import scala.concurrent.ExecutionContext
import com.databricks.WorkflowException
val numNotebooksInParallel = 5
// If you create too many notebooks in parallel the driver may crash when you submit all of the jobs at once.
// This code limits the number of parallel notebooks.
implicit val ec = ExecutionContext.fromExecutor(Executors.newFixedThreadPool(numNotebooksInParallel))
val ctx = dbutils.notebook.getContext()
Future.sequence(
notebooks.map { notebook =>
Future {
dbutils.notebook.setContext(ctx)
runWithRetry(notebook.path, notebook.timeout, notebook.parameters)
}
.recover {
case NonFatal(e) => s"ERROR: ${e.getMessage}"
}
}
)
}