【spark】Running in local mode
Maven dependencies:
<properties>
    <maven.compiler.source>8</maven.compiler.source>
    <maven.compiler.target>8</maven.compiler.target>
    <scala.binary.version>2.11</scala.binary.version>
    <scala.version>2.11.12</scala.version>
    <spark.version>2.4.1</spark.version>
    <parser.combinators.version>1.1.1</parser.combinators.version>
    <hadoop.version>2.10.0</hadoop.version>
</properties>
<dependencies>
    <!-- scala -->
    <dependency>
        <groupId>org.scala-lang</groupId>
        <artifactId>scala-library</artifactId>
        <version>${scala.version}</version>
    </dependency>
    <dependency>
        <groupId>org.scala-lang</groupId>
        <artifactId>scala-compiler</artifactId>
        <version>${scala.version}</version>
    </dependency>
    <dependency>
        <groupId>org.scala-lang.modules</groupId>
        <artifactId>scala-parser-combinators_${scala.binary.version}</artifactId>
        <version>${parser.combinators.version}</version>
    </dependency>
    <!-- spark; slf4j-log4j12 is excluded from each Spark artifact so the
         application can choose its own SLF4J binding -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_${scala.binary.version}</artifactId>
        <version>${spark.version}</version>
        <exclusions>
            <exclusion>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-log4j12</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_${scala.binary.version}</artifactId>
        <version>${spark.version}</version>
        <exclusions>
            <exclusion>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-log4j12</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-mllib_${scala.binary.version}</artifactId>
        <version>${spark.version}</version>
        <exclusions>
            <exclusion>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-log4j12</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-hive_${scala.binary.version}</artifactId>
        <version>${spark.version}</version>
        <exclusions>
            <exclusion>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-log4j12</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <!-- hadoop -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
</dependencies>
<build>
    <plugins>
        <plugin>
            <groupId>net.alchim31.maven</groupId>
            <artifactId>scala-maven-plugin</artifactId>
            <version>3.1.4</version>
            <executions>
                <!-- Run scala compiler in the process-resources phase, so that dependencies on
                     scala classes can be resolved later in the (Java) compile phase -->
                <execution>
                    <id>scala-compile-first</id>
                    <phase>process-resources</phase>
                    <goals>
                        <goal>compile</goal>
                    </goals>
                </execution>
                <!-- Run scala compiler in the process-test-resources phase, so that dependencies on
                     scala classes can be resolved later in the (Java) test-compile phase -->
                <execution>
                    <id>scala-test-compile</id>
                    <phase>process-test-resources</phase>
                    <goals>
                        <goal>testCompile</goal>
                    </goals>
                </execution>
                <execution>
                    <id>scala-add-source</id>
                    <phase>package</phase>
                    <goals>
                        <goal>add-source</goal>
                    </goals>
                </execution>
            </executions>
            <configuration>
                <jvmArgs>
                    <jvmArg>-Xms128m</jvmArg>
                    <jvmArg>-Xmx512m</jvmArg>
                </jvmArgs>
                <addScalacArgs>-target:jvm-1.8</addScalacArgs>
            </configuration>
        </plugin>
    </plugins>
</build>
Spark code:
package cn.zwy.hdfs

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object Main {
  def main(args: Array[String]): Unit = {
    // local[*] runs Spark inside this JVM, with one worker thread per core
    val sparkConf: SparkConf = new SparkConf()
    sparkConf.setMaster("local[*]").setAppName("csvHdfsTest")
    val ss = SparkSession.builder().config(sparkConf).getOrCreate()
    // Word count over a log file read directly from HDFS
    val textFile = ss.sparkContext.textFile("hdfs://node5:8020/user/root/marayarn/upload/20210329/firewall.log")
    val counts = textFile.flatMap(line => line.split(" "))
      .map(word => (word, 1))
      .reduceByKey(_ + _)
    counts.foreach(println)
  }
}
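With the master set to local[*], Main can be launched straight from the IDE; no cluster installation is needed. Note that the hdfs://node5:8020 URI addresses a single NameNode's RPC port directly, which only works when HDFS is not running in high-availability mode; the HA case is covered below.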
Another issue
When HDFS is configured for high availability, a java.net.UnknownHostException is thrown: the URI then names the logical nameservice rather than a real host, and without the HA client settings the client tries to resolve it as a hostname. The fix is to put hdfs-site.xml under the resources directory, or to call configuration.addResource(new Path("/root/hdfs-site.xml"));
Tracing through it, the cause is that when FileSystem loads HdfsConfiguration during its ServiceLoader phase, HdfsConfiguration triggers Configuration.addDefaultResource("hdfs-site.xml");
and Configuration.addDefaultResource can only load resource files from the classpath. This is also why hdfs-site.xml cannot be injected through an environment variable.
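A minimal sketch of the addResource workaround applied to the Spark job above (the object name HaMain, the nameservice name mycluster, and the input path are hypothetical placeholders; the real nameservice name comes from dfs.nameservices in your hdfs-site.xml):

package cn.zwy.hdfs

import org.apache.hadoop.fs.Path
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object HaMain {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("haHdfsTest")
    val ss = SparkSession.builder().config(sparkConf).getOrCreate()
    // Load the HA client config explicitly, before the first HDFS access:
    // unlike Configuration.addDefaultResource, addResource accepts an arbitrary path.
    ss.sparkContext.hadoopConfiguration.addResource(new Path("/root/hdfs-site.xml"))
    // "mycluster" stands in for the logical nameservice defined in hdfs-site.xml;
    // with the config loaded, the client resolves it to the active NameNode
    // instead of treating it as a hostname.
    val textFile = ss.sparkContext.textFile("hdfs://mycluster/user/root/firewall.log")
    textFile.take(10).foreach(println)
    ss.stop()
  }
}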
