Setting up a Scala development environment for Spark took me a few days; the details are at http://www.cnblogs.com/ljy2013/p/4964201.html. The next step was to test the environment with an example, so I went with the classic big-data example: WordCount. The details follow:
1. First, on top of the environment set up earlier, create a Maven project to hold the Scala code. The project directory looks like this:
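(The original post shows a screenshot of the layout here. Assuming the project was generated from the scala-maven-plugin's scala-archetype-simple archetype, which the pom.xml below resembles, the layout would look roughly like this:)

Spark-demo
├── pom.xml
└── src
    ├── main
    │   └── scala
    │       └── com/yiban/datacenter/Spark_demo
    │           └── App.scala
    └── test
        └── scala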
2. Write the code:
package com.yiban.datacenter.Spark_demo

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem

/**
 * @author ${user.name}
 */
object App {

  // Leftover helper from the Maven archetype; not used by the job itself
  def foo(x: Array[String]) = x.foldLeft("")((a, b) => a + b)

  def main(args: Array[String]) {
    // Hadoop configuration -- without this, running in local mode throws an error
    val hadoopconf = new Configuration()
    hadoopconf.setBoolean("fs.hdfs.impl.disable.cache", true)
    val fileSystem = FileSystem.get(hadoopconf)

    // Spark configuration -- submit to the YARN cluster
    val conf = new SparkConf().setAppName("wordcount").setMaster("yarn-cluster")
    val sc = new SparkContext(conf)

    // WordCount: split each line on spaces, emit (word, 1), sum the counts per word
    sc.textFile("/user/liujiyu/input", 1)
      .flatMap(_.split(" "))
      .map(word => (word, 1))
      .reduceByKey(_ + _)
      .saveAsTextFile("/user/liujiyu/sparkwordcountoutput")

    // A second, minimal RDD example: parallelize a local collection and save it
    val data = Array(1, 2, 3, 4, 5)
    val data2 = Seq(1, 2, 3) // unused, kept from the original
    val distData = sc.parallelize(data)
    distData.saveAsTextFile("/user/liujiyu/spark-demo")
  }
}
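Before packaging for YARN, the same pipeline can be smoke-tested inside the IDE. Below is a minimal sketch; the object name, the local[*] master, and the input.txt path are illustrative assumptions, not part of the original project:

import org.apache.spark.{SparkConf, SparkContext}

object LocalWordCount {
  def main(args: Array[String]) {
    // local[*] runs Spark inside the current JVM using all available cores
    val conf = new SparkConf().setAppName("wordcount-local").setMaster("local[*]")
    val sc = new SparkContext(conf)

    sc.textFile("input.txt")            // hypothetical local input file
      .flatMap(_.split(" "))
      .map(word => (word, 1))
      .reduceByKey(_ + _)
      .collect()                        // small data only: pulls results to the driver
      .foreach(println)

    sc.stop()
  }
}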
3. The contents of the pom.xml file are as follows:
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.yiban.datacenter</groupId>
  <artifactId>Spark-demo</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <name>${project.artifactId}</name>
  <description>My wonderful scala app</description>
  <inceptionYear>2015</inceptionYear>
  <licenses>
    <license>
      <name>My License</name>
      <url>http://....</url>
      <distribution>repo</distribution>
    </license>
  </licenses>

  <properties>
    <maven.compiler.source>1.6</maven.compiler.source>
    <maven.compiler.target>1.6</maven.compiler.target>
    <encoding>UTF-8</encoding>
    <scala.version>2.10.5</scala.version>
    <scala.compat.version>2.10</scala.compat.version>
  </properties>

  <repositories>
    <repository>
      <id>cloudera-repo-releases</id>
      <url>https://repository.cloudera.com/artifactory/repo/</url>
    </repository>
  </repositories>

  <dependencies>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.10</artifactId>
      <version>1.5.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.6.0-cdh5.4.4</version>
    </dependency>
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
    </dependency>

    <!-- Test -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.specs2</groupId>
      <artifactId>specs2-core_${scala.compat.version}</artifactId>
      <version>2.4.16</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.scalatest</groupId>
      <artifactId>scalatest_${scala.compat.version}</artifactId>
      <version>2.2.4</version>
      <scope>test</scope>
    </dependency>
  </dependencies>

  <build>
    <sourceDirectory>src/main/scala</sourceDirectory>
    <testSourceDirectory>src/test/scala</testSourceDirectory>
    <plugins>
      <plugin>
        <!-- see http://davidb.github.com/scala-maven-plugin -->
        <groupId>net.alchim31.maven</groupId>
        <artifactId>scala-maven-plugin</artifactId>
        <version>3.2.0</version>
        <executions>
          <execution>
            <goals>
              <goal>compile</goal>
              <goal>testCompile</goal>
            </goals>
            <configuration>
              <args>
                <arg>-make:transitive</arg>
                <arg>-dependencyfile</arg>
                <arg>${project.build.directory}/.scala_dependencies</arg>
              </args>
            </configuration>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-surefire-plugin</artifactId>
        <version>2.18.1</version>
        <configuration>
          <useFile>false</useFile>
          <disableXmlReport>true</disableXmlReport>
          <!-- If you have classpath issues like NoClassDefFoundError, ... -->
          <!-- useManifestOnlyJar>false</useManifestOnlyJar -->
          <includes>
            <include>**/*Test.*</include>
            <include>**/*Suite.*</include>
          </includes>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>
4. Run mvn clean package to build and package the project.
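If the build succeeds, the jar lands in the target directory; the file name follows from the artifactId and version in the pom. Note that this pom has no shade/assembly step, so the jar contains only the application classes; on YARN, Spark and Hadoop classes are supplied by the cluster through spark-submit.

mvn clean package
ls target/Spark-demo-0.0.1-SNAPSHOT.jar   # jar name = artifactId + version from the pom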
5. Copy the packaged jar to the cluster and run it with the following command:
spark-submit --class "com.yiban.datacenter.Spark_demo.App" --master yarn-cluster Spark-demo-0.0.1-SNAPSHOT.jar
When the job finishes, the results are written to the output paths used in the code; check those paths on HDFS to view them.
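For example, the word counts can be inspected with the standard HDFS shell; the paths come from the code above (the part-file name may differ depending on the number of output partitions):

hdfs dfs -ls /user/liujiyu/sparkwordcountoutput
hdfs dfs -cat /user/liujiyu/sparkwordcountoutput/part-00000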