Question

我正在尝试将Spark Structured Streaming与kafka连接，它会抛出以下错误：

线程“main”中的异常 java.lang.ClassNotFoundException：无法找到数据源：kafka。请在...查找相应的软件包。

基于documentation我添加了所需的依赖项

我的kafka和zookeeper服务器正在运行。不确定是什么问题。另外，我这样使用它

// Brings Spark's implicit encoders into scope (needed by .as[InputMessage] below).
import spark.implicits._
// Streaming read from the "kafka" source, subscribed to topic "kafka_input_topic",
// converted to a typed Dataset[InputMessage] and filtered to records whose lang equals "en".
// NOTE(review): 2181 is normally ZooKeeper's client port; kafka.bootstrap.servers expects
// a Kafka broker address (default port 9092) — confirm.
val feedback =spark.readStream.format("kafka").option("kafka.bootstrap.servers", "localhost:2181").option("subscribe", "kafka_input_topic")
      .load().as[InputMessage].filter(_.lang.equals("en"))

感谢任何帮助。谢谢

Answer 1

问题是在运行时（不是构建时），CLASSPATH中没有包含必要的jar。

您根据所链接的文档（documentation），将所需的依赖项添加到了构建定义文件中（pom.xml、build.sbt或build.gradle），但是异常是在您尝试运行构建后的应用程序时发生的，不是吗？

您错过的是有关部署的文档部分，即Deploying：

与任何Spark应用程序一样，spark-submit用于启动您的应用程序。spark-sql-kafka-0-10_2.11及其依赖项可以使用--packages直接添加到spark-submit中，例如：

./bin/spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.2.0 ...

您必须添加此--packages，或者必须创建一个uber-jar，它将使依赖项成为jar文件的一部分。

Answer 2

正如您在评论中提到的那样，问题是：

<scope>provided</scope>

删除spark-sql-kafka依赖项的provided范围，因为Spark安装本身并不提供该依赖项。

Answer 3

如果使用maven，则以下构建具有依赖关系的jar的方法可能会解决您的问题。

添加如下的spark依赖项：

// BASE SETUP
// =============================================================================
// Minimal Express + Mongoose REST API that exposes a `students` resource
// under the /api prefix.

// call the packages we need
var express    = require('express');
var bodyParser = require('body-parser');
var app        = express();

// configure app: parse URL-encoded and JSON request bodies into req.body
app.use(bodyParser.urlencoded({ extended: true }));
app.use(bodyParser.json());

var port     = process.env.PORT || 8080; // set our port (PORT env var, else 8080)

var mongoose   = require('mongoose');
// connect to our database
// The previous URI ('mongodb://localhost//:20717') was malformed: it had a stray
// "//" between the host and the port.
// NOTE(review): MongoDB's default port is 27017; 20717 may be a transposition — confirm.
mongoose.connect('mongodb://localhost:20717');
var Students     = require('./app/models/students');

// ROUTES FOR OUR API
// =============================================================================

// create our router
var router = express.Router();

// middleware to use for all requests
router.use(function(req, res, next) {
    // do logging
    console.log('Something is happening.');
    next(); // hand over to the matching route handler
});

// test route to make sure everything is working (accessed at GET http://localhost:8080/api)
router.get('/', function(req, res) {
    res.json({ message: 'hooray! welcome to our api!' });
});


router.route('/students')

    // create a students (accessed at POST http://localhost:8080/api/students)
    .post(function(req, res) {

        var students = new Students();      // create a new instance of the Students model
        students.name = req.body.name;  // set the students name (comes from the request)

        students.save(function(err) {
            // return here so that a failed save does not also send the success
            // response (a second response on the same request)
            if (err) {
                return res.send(err);
            }

            res.json({ message: 'Students created!' });
        });


    })

    // get all the students (accessed at GET http://localhost:8080/api/students)
    .get(function(req, res) {
        Students.find(function(err, students) {
            // return here so that a failed query does not also send the result list
            if (err) {
                return res.send(err);
            }

            res.json(students);
        });
    });



// REGISTER OUR ROUTES -------------------------------
// every route defined on `router` is served under the /api prefix
app.use('/api', router);

// START THE SERVER
// =============================================================================
app.listen(port);
console.log('Magic happens on port ' + port);

相应的Maven依赖项如下（spark-core的scope由spark.scope属性决定）：

<!-- spark-core: its scope is read from the spark.scope property instead of being hard-coded -->
<dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.11</artifactId>
        <version>2.2.1</version>
        <scope>${spark.scope}</scope>
    </dependency>
    <!-- Kafka source for Structured Streaming: no scope element is declared for this dependency -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql-kafka-0-10_2.11</artifactId>
        <version>2.2.1</version>
    </dependency>

然后按如下方式配置maven的profile，用于设置spark.scope属性：

<!-- Build profiles; each one sets the profile.id and spark.scope properties -->
<profiles>
    <!-- "default": active when no other profile is selected; spark.scope is compile -->
    <profile>
        <id>default</id>
        <properties>
            <profile.id>dev</profile.id>
            <spark.scope>compile</spark.scope>
        </properties>
        <activation>
            <activeByDefault>true</activeByDefault>
        </activation>
    </profile>
    <!-- "test": spark.scope is provided -->
    <profile>
        <id>test</id>
        <properties>
            <profile.id>test</profile.id>
            <spark.scope>provided</spark.scope>
        </properties>
    </profile>
    <!-- "online": spark.scope is provided -->
    <profile>
        <id>online</id>
        <properties>
            <profile.id>online</profile.id>
            <spark.scope>provided</spark.scope>
        </properties>
    </profile>
</profiles>

然后使用<plugin> <artifactId>maven-assembly-plugin</artifactId> <version>3.1.0</version> <configuration> <descriptorRefs> <descriptorRef>jar-with-dependencies</descriptorRef> </descriptorRefs> </configuration> <executions> <execution> <id>make-assembly</id>  <phase>package</phase>  <goals> <goal>single</goal> </goals> </execution> </executions> </plugin>来制作你的jar。这应该可以解决您的问题

Answer 4

您可以使用完全限定名称（而不是别名）的kafka数据源，如下所示：

spark.readStream.format("org.apache.spark.sql.kafka010.KafkaSourceProvider").load

为什么执行结构化流应用程序失败并显示“无法找到数据源：kafka”？

4 个答案: