I'm running a heavy Spark SQL job (attached at the end) and it takes a very long time. The executor monitoring page shows a lot of GC time consumed during execution, but Storage Memory usage is very low. On the stage monitoring page, all the running tasks also show high GC time.
The code is below. Can anyone suggest an approach or direction to debug/tune this situation? Thanks!
object etaWithSpeed {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder()
.appName("etaWithSpeed")
.config("num-executors", 15)
.config("executor-memory", 4)
.enableHiveSupport()
.getOrCreate()
import spark.implicits._
spark.sparkContext.setLogLevel("WARN")
import ch.hsr.geohash.{WGS84Point, GeoHash, BoundingBox}
import math._
import scala.annotation.tailrec
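// Great-circle distance in kilometres between two (lat, lon) points via the haversine formula (Earth radius 6372.8 km).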
def haversine(lat1:Double, lon1:Double, lat2:Double, lon2:Double)={
val dLat=(lat2 - lat1).toRadians
val dLon=(lon2 - lon1).toRadians
val a = pow(sin(dLat/2),2) + pow(sin(dLon/2),2) * cos(lat1.toRadians) * cos(lat2.toRadians)
val c = 2 * asin(sqrt(a))
6372.8 * c
}
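// Base32 geohash of a point at 30-bit precision.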
def geohash(lat: Double, lon: Double)={
GeoHash.withBitPrecision(lat, lon, 30).toBase32
}
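// The neighbouring geohash cells returned by getAdjacent(), plus the cell itself, as base32 strings.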
def geohash_grid_expand(geohash: String)={
GeoHash.fromGeohashString(geohash).getAdjacent().map(_.toBase32) :+ geohash
}
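// All 30-bit geohash cells lying in the lat/lon envelope spanned by the cells of the two points, found by flood-filling neighbours from the first cell.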
def geohash_envelope_between_two_grid(lat1: Double, lon1: Double, lat2: Double, lon2: Double): List[String] = {
val gh1 = GeoHash.withBitPrecision(lat1, lon1, 30)
val gh2 = GeoHash.withBitPrecision(lat2, lon2, 30)
// MARK: decide the longitude border of the envelope; kind = 1 means the envelope crosses the 180° antimeridian
val sortedLons = List(gh1.getBoundingBox.getMaxLon, gh1.getBoundingBox.getMinLon, gh2.getBoundingBox.getMaxLon, gh2.getBoundingBox.getMinLon).sorted
var lonMin = 0.0
var lonMax = 0.0
var kind = 0
if(abs(gh1.getBoundingBoxCenterPoint.getLongitude-gh2.getBoundingBoxCenterPoint.getLongitude) <= 180.0) {
lonMin = sortedLons(0)
lonMax = sortedLons(3)
kind = 0
}
else {
lonMin = sortedLons(1)
lonMax = sortedLons(2)
kind = 1
}
val sortedLats = List(gh1.getBoundingBox.getMinLat, gh1.getBoundingBox.getMaxLat, gh2.getBoundingBox.getMinLat, gh2.getBoundingBox.getMaxLat).sorted
val latMin = sortedLats.min
val latMax = sortedLats.max
def withinEnvelope(ghCur: GeoHash): Boolean = {
val ghCurLon = ghCur.getBoundingBoxCenterPoint.getLongitude
val ghCurLat = ghCur.getBoundingBoxCenterPoint.getLatitude
if(ghCurLat >= latMin && ghCurLat <= latMax) {
if(kind == 0 && ghCurLon >= lonMin && ghCurLon <= lonMax) {
true
}
else if(kind == 1 && (ghCurLon <= lonMin || ghCurLon >= lonMax)) {
true
}
else {
false
}
}
else {
false
}
}
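// Tail-recursive flood fill over the 4-neighbourhood, constrained to the envelope; buffer accumulates the visited cells.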
@tailrec
def expand_neighbors_impl(toGoThrough: List[GeoHash], buffer: Set[GeoHash] = Set()): Set[GeoHash] = {
toGoThrough.headOption match {
case None => buffer
case Some(ghCur) =>
if (buffer contains ghCur) {
expand_neighbors_impl(toGoThrough.tail, buffer)
}
else {
val neighbors = get4GeoHashAround(ghCur).filter(withinEnvelope(_))
expand_neighbors_impl(neighbors ++: toGoThrough, buffer + ghCur)
}
}
}
def expand_neighbors(): Set[GeoHash] = expand_neighbors_impl(List(gh1))
def get4GeoHashAround(gh: GeoHash): Array[GeoHash] = {
Array(gh.getNorthernNeighbour, gh.getSouthernNeighbour, gh.getWesternNeighbour, gh.getEasternNeighbour)
}
expand_neighbors().toList.map(_.toBase32)
}
spark.sqlContext.udf.register("haversine", haversine _)
spark.sqlContext.udf.register("geohash", geohash _)
spark.sqlContext.udf.register("geohash_grid_expand", geohash_grid_expand _)
spark.sqlContext.udf.register("geohash_envelope_between_two_grid", geohash_envelope_between_two_grid _)
spark.sql("""with tmp1 as(
select
ied.*,
explode(geohash_envelope_between_two_grid(ied.latitude1, ied.longitude1, ied.latitude2, ied.longitude2)) as geohash_value
from dev_oussama.inhouse_eta_data ied
)
select
tmp1.time,
tmp1.latitude1,
tmp1.longitude1,
tmp1.latitude2,
tmp1.longitude2,
tmp1.google_eta,
tmp1.booking_id,
tmp1.user_id,
tmp1.real_eta,
tmp1.osm_eta,
tmp1.osm_distance,
tmp1.osm_api_result,
sum(gr.avg_speed_mps*gr.no_of_driver_ping)/sum(gr.no_of_driver_ping) as avg_speed_mps
from tmp1
join dev_jason_zhu.geohash_region_metrics gr on tmp1.geohash_value = gr.geohash_value
group by
tmp1.time,
tmp1.latitude1,
tmp1.longitude1,
tmp1.latitude2,
tmp1.longitude2,
tmp1.google_eta,
tmp1.booking_id,
tmp1.user_id,
tmp1.real_eta,
tmp1.osm_eta,
tmp1.osm_distance,
tmp1.osm_api_result
limit 1000""").show()
}
}
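To give a sense of the blow-up, this is how I can check how many cells the envelope UDF emits for a single trip (a small sketch of my own, with arbitrary sample coordinates, assuming the same session with the UDF registered; not part of the job):
spark.sql("""
select size(geohash_envelope_between_two_grid(1.30, 103.80, 1.35, 103.95)) as n_cells
""").show()
// n_cells is the number of rows explode() will emit for that one record.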
P.S.
When I monitor the live heap instances of one executor's Java process, the following is printed. Clearly there are a lot of GeoHash-related objects; is there some way to deal with this?
bash-4.2$ jmap -histo:live 19808 | head
num #instances #bytes class name
----------------------------------------------
1: 6199 155188800 [B
2: 591 101908160 [J
3: 766706 36801888 ch.hsr.geohash.BoundingBox
4: 766706 24534592 ch.hsr.geohash.GeoHash
5: 766706 24534592 ch.hsr.geohash.WGS84Point
6: 766876 18405024 scala.collection.immutable.$colon$colon
7: 219583 5269992 scala.collection.immutable.HashSet$HashSet1
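Those counts look consistent with the flood fill: every GeoHash built during the expansion also holds a BoundingBox and a WGS84Point, which is why the three counts are identical. At 30 bits the cells are quite small, so the envelope between two points that are tens of kilometres apart already contains thousands of cells. A quick sanity check of the cell size (my own sketch, using the same ch.hsr.geohash calls as the job; any point works):
val bb = ch.hsr.geohash.GeoHash.withBitPrecision(0.0, 0.0, 30).getBoundingBox
println((bb.getMaxLat - bb.getMinLat, bb.getMaxLon - bb.getMinLon))
// roughly 0.0055 deg of latitude by 0.011 deg of longitude per cell,
// i.e. about 0.6 km x 1.2 km near the equator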
Executor stdout log:
2017-04-15T07:41:06.702+0000: [GC (Allocation Failure) 2017-04-15T07:41:06.702+0000: [ParNew: 306688K->34048K(306688K), 0.1675930 secs] 2717803K->2474880K(6257408K), 0.1676444 secs] [Times: user=0.65 sys=0.00, real=0.17 secs]
2017-04-15T07:41:06.991+0000: [GC (Allocation Failure) 2017-04-15T07:41:06.991+0000: [ParNew: 306688K->34048K(306688K), 0.2082425 secs] 2747520K->2511759K(6257408K), 0.2082937 secs] [Times: user=0.80 sys=0.00, real=0.21 secs]
2017-04-15T07:41:07.318+0000: [GC (Allocation Failure) 2017-04-15T07:41:07.318+0000: [ParNew: 306688K->34048K(306688K), 0.1989878 secs] 2784399K->2560107K(6257408K), 0.1990396 secs] [Times: user=0.76 sys=0.00, real=0.20 secs]
2017-04-15T07:41:07.636+0000: [GC (Allocation Failure) 2017-04-15T07:41:07.636+0000: [ParNew: 306688K->34048K(306688K), 0.2265302 secs] 2832747K->2614298K(6257408K), 0.2265810 secs] [Times: user=0.87 sys=0.00, real=0.22 secs]
2017-04-15T07:41:07.981+0000: [GC (Allocation Failure) 2017-04-15T07:41:07.981+0000: [ParNew: 306688K->34048K(306688K), 0.1750708 secs] 2886938K->2657702K(6257408K), 0.1751232 secs] [Times: user=0.66 sys=0.00, real=0.18 secs]
2017-04-15T07:41:08.273+0000: [GC (Allocation Failure) 2017-04-15T07:41:08.273+0000: [ParNew: 306688K->34048K(306688K), 0.2805720 secs] 2930342K->2710186K(6257408K), 0.2806221 secs] [Times: user=1.09 sys=0.00, real=0.28 secs]
2017-04-15T07:41:08.672+0000: [GC (Allocation Failure) 2017-04-15T07:41:08.672+0000: [ParNew: 306688K->34046K(306688K), 0.1939371 secs] 2982826K->2759971K(6257408K), 0.1940055 secs] [Times: user=0.74 sys=0.00, real=0.20 secs]
2017-04-15T07:41:08.985+0000: [GC (Allocation Failure) 2017-04-15T07:41:08.985+0000: [ParNew: 306686K->34046K(306688K), 0.1824374 secs] 3032611K->2803546K(6257408K), 0.1824884 secs] [Times: user=0.70 sys=0.00, real=0.18 secs]
2017-04-15T07:41:09.286+0000: [GC (Allocation Failure) 2017-04-15T07:41:09.286+0000: [ParNew: 306686K->34046K(306688K), 0.2459598 secs] 3076186K->2862064K(6257408K), 0.2460109 secs] [Times: user=0.95 sys=0.00, real=0.24 secs]
2017-04-15T07:41:09.650+0000: [GC (Allocation Failure) 2017-04-15T07:41:09.650+0000: [ParNew: 306686K->34048K(306688K), 0.2248917 secs] 3134704K->2916068K(6257408K), 0.2249424 secs] [Times: user=0.87 sys=0.00, real=0.23 secs]
2017-04-15T07:41:09.994+0000: [GC (Allocation Failure) 2017-04-15T07:41:09.994+0000: [ParNew: 306688K->34048K(306688K), 0.1995261 secs] 3188708K->2963537K(6257408K), 0.1995778 secs] [Times: user=0.77 sys=0.00, real=0.20 secs]
2017-04-15T07:41:10.312+0000: [GC (Allocation Failure) 2017-04-15T07:41:10.312+0000: [ParNew: 306688K->34048K(306688K), 0.2394581 secs] 3236177K->3013025K(6257408K), 0.2395090 secs] [Times: user=0.91 sys=0.01, real=0.24 secs]
2017-04-15T07:41:10.675+0000: [GC (Allocation Failure) 2017-04-15T07:41:10.675+0000: [ParNew: 306688K->34048K(306688K), 0.2054829 secs] 3285665K->3065302K(6257408K), 0.2055344 secs] [Times: user=0.78 sys=0.00, real=0.20 secs]
2017-04-15T07:41:10.999+0000: [GC (Allocation Failure) 2017-04-15T07:41:10.999+0000: [ParNew: 306688K->34046K(306688K), 0.2380438 secs] 3337942K->3122984K(6257408K), 0.2380944 secs] [Times: user=0.91 sys=0.00, real=0.24 secs]
2017-04-15T07:41:11.360+0000: [GC (Allocation Failure) 2017-04-15T07:41:11.360+0000: [ParNew: 306686K->34046K(306688K), 0.2232903 secs] 3395624K->3177798K(6257408K), 0.2233435 secs] [Times: user=0.85 sys=0.00, real=0.23 secs]
2017-04-15T07:41:11.703+0000: [GC (Allocation Failure) 2017-04-15T07:41:11.703+0000: [ParNew: 306686K->34046K(306688K), 0.1517400 secs] 3450438K->3215043K(6257408K), 0.1517907 secs] [Times: user=0.58 sys=0.00, real=0.14 secs]