我正在运行一段 Spark 代码,把数据保存到 HBase 中;运行环境是 Amazon EMR 5.8.0,集群上安装的是 Spark 2.2.0。在 IntelliJ 中运行时一切正常,但在 EMR 集群上运行时会抛出以下错误:
java.lang.NoClassDefFoundError: org/apache/spark/sql/DataFrame
代码
// Loads a Ctrl-A (U+0001) delimited CSV file and writes it into an existing
// Phoenix table, using the table's own schema for the outgoing DataFrame.
//
// Expected arguments, in order:
//   args(0) - ZooKeeper quorum passed to Phoenix as "zkUrl"
//   args(1) - name of the target Phoenix table
//   args(2) - path of the input CSV file
require(
  args.length >= 3,
  "Usage: PhoenixSpark <zookeeperQuorum> <tableName> <inputPath>"
)
val zookeeperQuorum = args(0)
val tableName = args(1)
val inputPath = args(2)

val spark = SparkSession.builder
  .appName("PhoenixSpark")
  .getOrCreate()

// Stop the session even when reading or writing fails, so the application
// does not keep its resources after an error.
try {
  // "\u0001" is the same character as the deprecated octal escape "\001".
  val df = spark.read
    .option("delimiter", "\u0001")
    .csv(inputPath)

  // The target table is read only to obtain its schema.
  val tableSchema = spark.read
    .format("org.apache.phoenix.spark")
    .option("table", tableName)
    .option("zkUrl", zookeeperQuorum)
    .load()
    .schema

  // Row key is "<_c3>_<_c5>_<_c0>"; it must be the first output column,
  // followed by the 20 CSV columns _c0 .. _c19 in their original order.
  val rowKey = concat(col("_c3"), lit("_"), col("_c5"), lit("_"), col("_c0")).as("row_key")
  val dataColumns = (0 to 19).map(i => col(s"_c$i"))
  val correctedDf = df.select((rowKey +: dataColumns): _*)

  // Re-apply the Phoenix table schema positionally to the CSV rows.
  // NOTE(review): CSV columns are read as strings; this assumes the table
  // schema is compatible with those values - confirm against the table DDL.
  val finalDf = spark.createDataFrame(correctedDf.rdd, tableSchema)

  finalDf.write
    .format("org.apache.phoenix.spark")
    .mode("overwrite")
    .option("table", tableName)
    .option("zkUrl", zookeeperQuorum)
    .save()
} finally {
  spark.stop()
}
我的pom.xml文件正确地指定了Spark版本为2.2.0。
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the PhoenixSpark job: compiles the Scala sources and
     packages them into a single jar-with-dependencies assembly. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.myntra.analytics</groupId>
    <artifactId>com.myntra.analytics</artifactId>
    <version>1.0-SNAPSHOT</version>
    <build>
        <plugins>
            <!-- Java sources are compiled for Java 8. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <!-- Compiles main and test Scala sources. -->
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <!-- "package" command plugin: builds the jar-with-dependencies
                 assembly during the package phase. -->
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>2.6</version>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
    <!-- All dependencies use "provided" scope: they are available at compile
         time but are NOT bundled into the assembly, so the runtime
         environment must supply them. -->
    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.2.0</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>2.2.0</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.11</artifactId>
            <version>2.2.0</version>
            <scope>provided</scope>
        </dependency>
        <!-- NOTE(review): because this is "provided", the phoenix-spark jar on
             the cluster classpath is the one loaded at runtime, not this
             version. A NoClassDefFoundError for org/apache/spark/sql/DataFrame
             usually points to a connector jar built against Spark 1.x (in
             Spark 2.x DataFrame is a type alias, not a class) - confirm which
             phoenix-spark jar the cluster actually provides. -->
        <dependency>
            <groupId>org.apache.phoenix</groupId>
            <artifactId>phoenix-spark</artifactId>
            <version>4.11.0-HBase-1.3</version>
            <scope>provided</scope>
        </dependency>
    </dependencies>
    <pluginRepositories>
        <pluginRepository>
            <id>scala-tools.org</id>
            <name>Scala-tools Maven2 Repository</name>
            <url>http://scala-tools.org/repo-releases</url>
        </pluginRepository>
    </pluginRepositories>
</project>
以下是来自 EMR 日志的堆栈跟踪,显示了此错误。
17/09/28 23:20:18 ERROR ApplicationMaster: User class threw exception:
java.lang.NoClassDefFoundError: org/apache/spark/sql/DataFrame
java.lang.NoClassDefFoundError: org/apache/spark/sql/DataFrame
at java.lang.Class.getDeclaredMethods0(Native Method)
at java.lang.Class.privateGetDeclaredMethods(Class.java:2701)
at java.lang.Class.getDeclaredMethod(Class.java:2128)
at java.io.ObjectStreamClass.getPrivateMethod(ObjectStreamClass.java:1475)
at java.io.ObjectStreamClass.access$1700(ObjectStreamClass.java:72)
at java.io.ObjectStreamClass$2.run(ObjectStreamClass.java:498)
at java.io.ObjectStreamClass$2.run(ObjectStreamClass.java:472)
at java.security.AccessController.doPrivileged(Native Method)
at java.io.ObjectStreamClass.<init>(ObjectStreamClass.java:472)
at java.io.ObjectStreamClass.lookup(ObjectStreamClass.java:369)
at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1134)
at java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1548)
at java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1509)
at java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1432)
at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1178)
at java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:348)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:43)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:295)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:288)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:108)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2287)
at org.apache.spark.rdd.RDD$$anonfun$map$1.apply(RDD.scala:370)
at org.apache.spark.rdd.RDD$$anonfun$map$1.apply(RDD.scala:369)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
at org.apache.spark.rdd.RDD.map(RDD.scala:369)
at org.apache.phoenix.spark.PhoenixRDD.toDataFrame(PhoenixRDD.scala:131)
at org.apache.phoenix.spark.PhoenixRelation.schema(PhoenixRelation.scala:60)
at org.apache.spark.sql.execution.datasources.LogicalRelation$.apply(LogicalRelation.scala:77)
at org.apache.spark.sql.SparkSession.baseRelationToDataFrame(SparkSession.scala:415)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:172)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:146)
at com.mynra.analytics.chronicles.PhoenixSpark$.main(PhoenixSpark.scala:29)
at com.mynra.analytics.chronicles.PhoenixSpark.main(PhoenixSpark.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.yarn.ApplicationMaster$$anon$2.run(ApplicationMaster.scala:635)
Caused by: java.lang.ClassNotFoundException: org.apache.spark.sql.DataFrame
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:335)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 41 more
import org.apache.spark.sql.DataFrame
- Shaido