RDD转换不正确

时间:2017-03-31 00:23:14

标签: apache-spark pyspark

我无法重复使用通过执行mapPartitions得到的新RDD。只有在mapPartitions之后再添加一个reduceByKey转换(我并不想执行reduceByKey)时,代码似乎才能正常工作。由于reduceByKey和mapPartitions都是转换操作,因此我不确定导致错误的原因。

# Build a local list of the integers 0 .. 2**20 - 1 (Python 2 `xrange`) and
# hand it to `sc.parallelize`; `sc` is not defined in this snippet
# (presumably the SparkContext -- confirm).
C = [x for x in xrange(2**20)]
C = sc.parallelize(C)
# Each pass rebinds C to the result of repartition -> mapPartitions(foo) -> persist.
# `foo` is not shown in this snippet, so what each partition is mapped to is unknown here.
while True:
    C = C.repartition(1000)\
        .mapPartitions(foo)\
        .persist()

    # NOTE(review): the loop only exits when `C.take(1)` compares equal to the integer 0.
    # take() presumably returns a list, in which case this comparison is never true -- confirm.
    if C.take(1) == 0 :
        break

报告错误:

TypeError: can't pickle listiterator objects

1 个答案:

答案 0 :(得分:0)

我认为错误是由您从mapPartitions返回的内容引起的。如果您返回的是一个新的RDD,我看不出代码会引发异常的原因。以下是我基于您的代码编写的代码段。

    // Lookup table: marker "type" attribute -> icon image used for that marker.
    // A type with no entry here simply gets no custom icon.
    var customIcons = {
      type1: { icon: 'icon_type1.png' },
      type2: { icon: 'icon_type2.png' },
      type3: { icon: 'icon_type3.png' },
      type4: { icon: 'icon_type4.png' }
    };

        /**
         * Creates the map inside the "map" element, downloads the marker XML
         * feed, adds one marker per <marker> element and hands all markers to
         * a MarkerClusterer.
         *
         * Clicking a marker opens the shared info window with the marker's
         * name and address, and writes the name into the "setlatlng" element.
         *
         * Marker attributes come from a remote document, so they are always
         * inserted as text (DOM nodes / textContent), never parsed as HTML.
         */
        function initMap() {
          var cluster = [];
          var map = new google.maps.Map(document.getElementById("map"), {
            center: new google.maps.LatLng(0, 0),
            zoom: 1,
            mapTypeId: 'roadmap'
          });
          var infowindow = new google.maps.InfoWindow();

          // Builds "<b>name</b> <br/>address" as DOM nodes so that markup
          // inside the attribute values is displayed literally.
          function buildInfoContent(name, address) {
            var container = document.createElement("div");
            var title = document.createElement("b");
            title.textContent = name;
            container.appendChild(title);
            container.appendChild(document.createTextNode(" "));
            container.appendChild(document.createElement("br"));
            container.appendChild(document.createTextNode(address));
            return container;
          }

          // Creates the map marker for one <marker> element and wires its
          // click handler. Being a function call, it gives every marker its
          // own name/address/marker bindings for the handler to close over.
          function createMarker(markerEl) {
            var name = markerEl.getAttribute("name") || "";
            var address = markerEl.getAttribute("address") || "";
            var type = markerEl.getAttribute("type");
            // Own-property check: a type such as "constructor" must not
            // resolve to something inherited from Object.prototype.
            var icon = Object.prototype.hasOwnProperty.call(customIcons, type) ?
                customIcons[type] :
                {};
            var marker = new google.maps.Marker({
              map: map,
              position: new google.maps.LatLng(
                  parseFloat(markerEl.getAttribute("lat")),
                  parseFloat(markerEl.getAttribute("lng"))),
              icon: icon.icon
            });

            google.maps.event.addListener(marker, 'click', function() {
              infowindow.setContent(buildInfoContent(name, address));
              infowindow.open(map, marker);

              //This sends information from the clicked icon back to the serverside code
              document.getElementById("setlatlng").textContent = name;
            });

            return marker;
          }

          // Change this depending on the name of your PHP file
          downloadUrl('https://my-website.com/the-sweet-sweet-xml-info.php', function(data) {
            var xml = data && data.responseXML;
            if (!xml || !xml.documentElement) {
              // Failed request or a body that is not XML: there is nothing to
              // plot, so report it instead of throwing inside the callback.
              console.error('initMap: marker feed returned no XML document');
              return;
            }

            var markers = xml.documentElement.getElementsByTagName("marker");
            for (var i = 0; i < markers.length; i++) {
              cluster.push(createMarker(markers[i]));
            }

            var options = {
              imagePath: '/location-of-cluster-icons/m'
            };

            new MarkerClusterer(map, cluster, options);
          });
        }

        /**
         * Registers a 'click' listener on `marker` that puts `html` into
         * `infoWindow` and opens that window on `map`, anchored to the marker.
         */
        function bindInfoWindow(marker, map, infoWindow, html) {
          var showInfo = function() {
            infoWindow.setContent(html);
            infoWindow.open(map, marker);
          };

          google.maps.event.addListener(marker, 'click', showInfo);
        }

        /**
         * Issues an asynchronous GET request for `url` and, once the request
         * reaches readyState 4, calls `callback(request, request.status)`.
         *
         * @param {string} url - Address to fetch.
         * @param {function(Object, number)} callback - Receives the request
         *     object and its HTTP status.
         */
        function downloadUrl(url, callback) {
          // Use the native XMLHttpRequest when the browser has one. The
          // ActiveX fallback must be created with the 'Microsoft.XMLHTTP'
          // ProgID; any other string (such as a file name) makes the
          // constructor fail.
          var request = window.XMLHttpRequest ?
              new XMLHttpRequest() :
              new ActiveXObject('Microsoft.XMLHTTP');

          request.onreadystatechange = function() {
            if (request.readyState === 4) {
              // Swap in a no-op handler so the callback cannot fire twice.
              request.onreadystatechange = doNothing;
              callback(request, request.status);
            }
          };

          request.open('GET', url, true);
          request.send(null);
        }

        /**
         * Deliberate no-op: takes no action and returns undefined.
         */
        function doNothing() {
          return undefined;
        }

    def f(it):
        s = 0
        l = 0
        for x in it:
            s += x
            l += 1
        if l > 1:
            yield s

    C = sc.parallelize([x for x in range(100)])
    while True:
        C = C.repartition(10)\
            .mapPartitions(f)
        if C.isEmpty():
            break

最好使用isEmpty()而不是take(1) == 0来检查RDD是否为空。