I am currently using Spark with DataFrames to issue a series of HTTP GET requests, one per Row, using the Apache HttpClient 4.5.x dependency for the GETs. I've noticed that when running in local mode (--master local[256]) I can get a fair number of GETs per second (around 100/s). However, unsurprisingly, performance starts to drop once I try to scale past 256 local driver threads. Because HttpClient spends most of its time blocked on IO, I can comfortably run 256 local threads on my 4-core laptop.
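For reference, I launch the local run with something along these lines (the jar name is a placeholder, not my real artifact):

spark-submit \
  --master "local[256]" \
  --class com.me.bot.Bot \
  bot-assembly.jar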
I then switched to a Spark cluster on AWS EMR with 5 worker nodes (each worker having twice as many cores as my laptop), hoping to mirror the same 256-threads-per-worker setup and get roughly 5x the throughput. But when I set --executor-cores in spark-submit to a number higher than the vCores YARN reports as available, the application fails to launch.
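For example, a submission roughly like the following (the numbers are illustrative, not my exact values) will not start, because 256 cores per executor is more than YARN says any node has:

spark-submit \
  --master yarn \
  --deploy-mode cluster \
  --num-executors 5 \
  --executor-cores 256 \
  --class com.me.bot.Bot \
  bot-assembly.jar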
Is there an easy way to replicate the local[256] behavior across the 5 worker nodes, so I get roughly 5x the throughput?
I have tried a few options, all with sub-par results: none of them produced as many GETs per second as local[256] (although they came close). Side note: this post explains a newer "sliding iterator" that I was unable to convert to (I've stuck with the scala.concurrent version, since the author changed his code to use Google's Futures library).
The local[256] setup was simply df.map(row => getRow(row)); a fleshed-out sketch of that variant follows, and after it the current codebase, which was written for the 5-worker-node setup.
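Fleshed out, that local[256] variant looked roughly like this (same hypothetical paths as the listing below):

val spark = SparkSession.builder().appName("Bot").getOrCreate()
import spark.implicits._

val df = spark.read.option("header", "true").csv("/path/to/urls.csv")
// One blocking GET per Row; concurrency comes entirely from the 256 local task threads.
df.map(row => getRow(row)).write.mode("append").parquet("/path/to/output")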
package com.me.bot

import java.util.concurrent.{Executors, TimeUnit}

import org.apache.http.client.methods.{CloseableHttpResponse, HttpGet}
import org.apache.http.client.protocol.HttpClientContext
import org.apache.http.impl.client.HttpClients
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager
import org.apache.spark.sql.{Row, SparkSession}

object Bot {
  val MaxConcurrency: Int = 128

  def main(args: Array[String]) = {
    val spark = SparkSession.builder().appName("Bot").getOrCreate()
    import spark.implicits._

    val df = spark.read.option("header", "true").csv("/path/to/urls.csv")

    df.rdd.
      map(row => ThreadedConcurrentContext.executeAsync(getRow(row))).
      mapPartitions(it => ThreadedConcurrentContext.awaitSliding(it, MaxConcurrency)).
      toDF().write.mode("append").parquet("/path/to/output")

    spark.stop()
  }

  def getRow(row: Row) = {
    val url = row.getAs[String]("url")
    GetResult(Browser.getStatusCode(url), url)
  }
}

case class GetResult(statusCode: Int, url: String)

object Browser extends Serializable {
  lazy val cm = {
    val manager = new PoolingHttpClientConnectionManager()
    manager.setMaxTotal(Bot.MaxConcurrency)
    manager.setDefaultMaxPerRoute(Bot.MaxConcurrency)
    manager
  }

  lazy val httpClient = HttpClients.custom().setConnectionManager(cm).setConnectionTimeToLive(5, TimeUnit.MINUTES).build()

  val context = new ThreadLocal[HttpClientContext] {
    override def initialValue = HttpClientContext.create()
  }

  def getStatusCode(url: String) = {
    var response: CloseableHttpResponse = null
    try {
      val httpget = new HttpGet(url)
      response = httpClient.execute(httpget, context.get())
      response.getStatusLine.getStatusCode
    }
    catch {
      case ex: Exception => -1
      case _: Throwable => -1
    }
    // soooo not Scala... sorry!
    finally {
      if (response != null) {
        response.close()
      }
    }
  }
}

// From http://www.russellspitzer.com/2017/02/27/Concurrency-In-Spark/
/** A singleton object that controls the parallelism on a Single Executor JVM */
object ThreadedConcurrentContext {
  import scala.concurrent._
  import scala.concurrent.duration.Duration
  import scala.concurrent.duration.Duration._

  implicit val ec = ExecutionContext.fromExecutorService(Executors.newWorkStealingPool(Bot.MaxConcurrency))

  /** Wraps a code block in a Future and returns the future */
  def executeAsync[T](f: => T): Future[T] = Future(f)

  /** Awaits only a set of elements at a time. Instead of waiting for the entire batch
    * to finish waits only for the head element before requesting the next future */
  def awaitSliding[T](it: Iterator[Future[T]], batchSize: Int, timeout: Duration = Inf): Iterator[T] = {
    val slidingIterator = it.sliding(batchSize - 1) // Our look ahead (hasNext) will auto start the nth future in the batch
    val (initIterator, tailIterator) = slidingIterator.span(_ => slidingIterator.hasNext)
    initIterator.map(futureBatch => Await.result(futureBatch.head, timeout)) ++
      tailIterator.flatMap(lastBatch => Await.result(Future.sequence(lastBatch), timeout))
  }

  def awaitAll[T](it: Iterator[Future[T]], timeout: Duration = Inf) = {
    Await.result(Future.sequence(it), timeout)
  }
}
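To illustrate the sliding-window behavior on its own (outside of Spark), here is a minimal standalone sketch; the task count and simulated latency are arbitrary assumptions:

package com.me.bot

object SlidingDemo {
  def main(args: Array[String]): Unit = {
    // 1,000 fake "requests", each blocking for ~50 ms.
    val tasks = (1 to 1000).iterator.map { i =>
      ThreadedConcurrentContext.executeAsync { Thread.sleep(50); i }
    }
    val start = System.nanoTime()
    // At most MaxConcurrency futures are in flight at once; results come back in input order.
    val results = ThreadedConcurrentContext.awaitSliding(tasks, Bot.MaxConcurrency).toList
    println(f"${results.size} tasks in ${(System.nanoTime() - start) / 1e9}%.1f s")
  }
}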
I would settle for roughly 5x the throughput using the 5 worker nodes. I have no plans to scale further, since my web server probably can't handle much more than that anyway.