一、Dependencies
Maven pom.xml:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.shujia</groupId>
    <artifactId>spark</artifactId>
    <version>1.0</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>2.11.12</version>
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-compiler</artifactId>
            <version>2.11.12</version>
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-reflect</artifactId>
            <version>2.11.12</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.4.5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>2.4.5</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.49</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>1.2.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-jdbc</artifactId>
            <version>1.2.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>2.4.5</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.79</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-mllib_2.11</artifactId>
            <version>2.4.5</version>
        </dependency>
        <dependency>
            <groupId>com.janeluo</groupId>
            <artifactId>ikanalyzer</artifactId>
            <version>2012_u6</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!-- Java Compiler -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <!-- Scala Compiler -->
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <version>2.15.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
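A quick way to confirm that the dependencies above resolve correctly is to run a trivial local driver. This is only a minimal sketch (the object name VersionCheck is just for illustration, not part of the project):

package com.shujia.spark.core

import org.apache.spark.{SparkConf, SparkContext}

// Smoke test: if the pom above resolves, this starts a local
// SparkContext and prints the Spark version.
object VersionCheck {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("VersionCheck").setMaster("local")
    val sc = new SparkContext(conf)
    println(sc.version) // expected: 2.4.5, matching the versions pinned above
    sc.stop()
  }
}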
二、Word Count
Sample input:
java,spark,hadoop
java,spark
java,spark,hadoop,spark
java,spark,hadoop
java,spark,hadoop,hadoop,spark
java,spark,hadoop,hadoop
java,spark,hadoop
java,spark,hadoop,spark
java
java,spark,hadoop
package com.shujia.spark.core

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo1WordCount {
  def main(args: Array[String]): Unit = {
    /**
     * 1. Set up the Spark environment
     */
    // Spark configuration object
    val conf = new SparkConf()
    // Set the name of the Spark job
    conf.setAppName("Demo1WordCount")
    // Set the run mode; local means run locally
    conf.setMaster("local")
    // Create the Spark context; sc is the entry point for writing Spark code
    val sc = new SparkContext(conf)

    /**
     * 2. Read the file
     * Under the hood, Spark reads files with the same code as MapReduce,
     * so the input-split rules are the same:
     * in Spark, each input split becomes one partition.
     *
     * RDD: Resilient Distributed Dataset; at this stage you can treat it like a List
     */
    // When running on a cluster, this reads the file from HDFS
    val linesRDD: RDD[String] = sc.textFile("data/words.txt")

    /**
     * 3. Flatten the lines into words
     */
    val wordsRDD: RDD[String] = linesRDD.flatMap(line => line.split(","))

    /**
     * 4. Group by word
     */
    val kvRDD: RDD[(String, Iterable[String])] = wordsRDD.groupBy(word => word)

    /**
     * 5. Count the occurrences of each word
     */
    val wordCount: RDD[String] = kvRDD.map {
      case (word: String, iter: Iterable[String]) =>
        val count: Int = iter.size
        s"$word\t$count"
    }

    /**
     * 6. Save the result
     */
    wordCount.saveAsTextFile("data/wordcount")
  }
}
Output:
spark 12
hadoop 10
java 10
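Note that groupBy shuffles every occurrence of each word across the network before counting. A more common pattern is reduceByKey, which combines counts on the map side before the shuffle. Below is a sketch of the same job rewritten that way (the object name and the output path wordcount_reduce are illustrative; saveAsTextFile fails if the path already exists):

package com.shujia.spark.core

import org.apache.spark.{SparkConf, SparkContext}

object Demo1WordCountReduce {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Demo1WordCountReduce").setMaster("local")
    val sc = new SparkContext(conf)

    sc.textFile("data/words.txt")
      .flatMap(_.split(","))
      .map(word => (word, 1))       // each word becomes a (word, 1) pair
      .reduceByKey(_ + _)           // partial sums are combined map-side before the shuffle
      .map { case (word, count) => s"$word\t$count" }
      .saveAsTextFile("data/wordcount_reduce")
  }
}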