SparkStreaming(三)——自定义数据源
从socketTextStream说起
查看socketTextStream源码
// Spark source excerpt: socketTextStream is a thin wrapper that plugs
// SocketReceiver.bytesToLines (bytes -> lines converter) into socketStream.
def socketTextStream(
    hostname: String,
    port: Int,
    storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2
  ): ReceiverInputDStream[String] = withNamedScope("socket text stream") {
  socketStream[String](hostname, port, SocketReceiver.bytesToLines, storageLevel)
}

// socketTextStream delegates to socketStream, which just constructs the DStream.
def socketStream[T: ClassTag](
    hostname: String,
    port: Int,
    converter: (InputStream) => Iterator[T],
    storageLevel: StorageLevel
  ): ReceiverInputDStream[T] = {
  new SocketInputDStream[T](this, hostname, port, converter, storageLevel)
}

// socketStream in turn builds a SocketInputDStream.
// NOTE(review): in the actual Spark source, onStart/onStop/receive below are
// members of SocketReceiver (the Receiver returned by
// SocketInputDStream.getReceiver), not of SocketInputDStream itself — the blog
// excerpt merges the two classes for brevity. Likewise the `socket` field used
// below is declared in the receiver class, outside this excerpt.
class SocketInputDStream[T: ClassTag](
    _ssc: StreamingContext,
    host: String,
    port: Int,
    bytesToObjects: InputStream => Iterator[T],
    storageLevel: StorageLevel
  ) extends ReceiverInputDStream[T](_ssc) {

  def onStart() {
    logInfo(s"Connecting to $host:$port")
    try {
      socket = new Socket(host, port) // open the socket connection
    } catch {
      case e: ConnectException =>
        restart(s"Error connecting to $host:$port", e)
        return
    }
    logInfo(s"Connected to $host:$port")

    // Start the thread that receives data over a connection
    new Thread("Socket Receiver") {
      setDaemon(true) // daemon thread: stays in the background, never blocks JVM shutdown
      override def run() { receive() } // run receive() on the new thread
    }.start()
  }
  // onStart(): open a socket connection and start a daemon thread running receive().

  def onStop() {
    // in case restart thread close it twice
    synchronized {
      if (socket != null) {
        socket.close() // close the socket
        socket = null
        logInfo(s"Closed socket to $host:$port")
      }
    }
  }
  // onStop(): close the socket connection (idempotent thanks to the null check).

  /** Create a socket connection and receive data until receiver is stopped */
  def receive() {
    try {
      // Convert the raw socket input stream into an iterator of records.
      val iterator = bytesToObjects(socket.getInputStream())
      while(!isStopped && iterator.hasNext) { // loop-termination condition
        store(iterator.next()) // push each record into Spark's memory
      }
      if (!isStopped()) {
        // The stream ended but the receiver was not asked to stop: reconnect.
        restart("Socket data stream had no more data")
      } else {
        logInfo("Stopped receiving")
      }
    } catch {
      case NonFatal(e) =>
        logWarning("Error receiving data", e)
        restart("Error receiving data", e)
    } finally {
      onStop()
    }
  }
}
/**
 * Store a single item of received data to Spark's memory.
 * These single items will be aggregated together into data blocks before
 * being pushed into Spark's memory.
 */
def store(dataItem: T) {
  // The receiver supervisor batches single items into blocks for Spark.
  supervisor.pushSingle(dataItem)
}
因此,可以参考上面的源码,设计自己的数据源,实现对远程主机特定端口的持续监控,并获取数据。需要继承Receiver抽象类,并实现抽象方法onStart和onStop
package sparkstreaming import java.io.{BufferedReader, InputStreamReader} import java.net.Socket import java.nio.charset.StandardCharsets import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.receiver.Receiver class CustomerReceiver(host: String, port: Int) extends Receiver[String](StorageLevel.MEMORY_ONLY) { var socket : Socket = null //最初启动的时候,调用该方法,作用为:开启守护线程,读取数据 override def onStart(): Unit = { new Thread("Socket Receiver") { override def run() { receive() } }.start() } //读数据并将数据发送给Spark def receive(): Unit = { //创建一个Socket socket = new Socket(host, port) //定义一个变量,用来接收端口传过来的数据 var input: String = null //创建一个BufferedReader用于读取端口传来的数据 val reader = new BufferedReader(new InputStreamReader(socket.getInputStream, StandardCharsets.UTF_8)) //读取数据 input = reader.readLine() //当receiver没有关闭并且没有收到结束标识,则循环发送数据给Spark //isStopped()方法由receiver提供,用于监测receiver的状态 while (!isStopped() && input != null) { store(input)//将数据送入spark内存,该方法由receiver提供 input = reader.readLine() } //跳出循环则关闭资源 reader.close() socket.close() //重启任务 restart("restart") } override def onStop(): Unit = { synchronized { if (socket != null) { socket.close()//关闭socket socket = null } } } }
测试代码如下,只需要改变数据源,其他的处理逻辑照旧:
package sparkstreaming import org.apache.spark.SparkConf import org.apache.spark.streaming.{Seconds, StreamingContext} object Demo1 { def main(args: Array[String]): Unit = { //1.初始化Spark配置信息 val sparkConf = new SparkConf().setMaster("local[*]") .setAppName("StreamWordCount") //2.初始化SparkStreamingContext val ssc = new StreamingContext(sparkConf, Seconds(5)) //3.创建自定义receiver的Streaming val lineStream = ssc.receiverStream(new CustomerReceiver("chxy001", 9999)) //4.将每一行数据做切分,形成一个个单词 val wordStreams = lineStream.flatMap(_.split("\t")) //5.将单词映射成元组(word,1) val wordAndOneStreams = wordStreams.map((_, 1)) //6.将相同的单词次数做统计 val wordAndCountStreams = wordAndOneStreams.reduceByKey(_ + _) //7.打印 wordAndCountStreams.print() //8.启动SparkStreamingContext ssc.start() ssc.awaitTermination() } }
可以实时打印出远程主机传递的数据!

浙公网安备 33010602011771号