Configuration Notes for Hadoop Ecosystem Components

Environment variables

cat /etc/profile.d/my_env.sh

#JAVA_HOME
export JAVA_HOME=/opt/module/jdk1.8.0_181
export CLASSPATH=$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar

export HADOOP_HOME=/opt/module/hadoop-3.2.2

#export SCALA_HOME=/opt/module/scala-2.13.0
export SCALA_HOME=/opt/module/scala-2.12.11

export HIVE_HOME=/opt/module/apache-hive-3.1.2-bin

export ZOOKEEPER_HOME=/opt/module/apache-zookeeper-3.6.3-bin

# export SPARK_HOME=/opt/module/spark-3.2.0-bin-hadoop3.2-scala2.13
export SPARK_HOME=/opt/module/spark-3.2.0-bin-hadoop3.2

export KAFKA_HOME=/opt/module/kafka_2.11-2.4.1

export MYBIN_HOME=/opt/module/bin

export FLUME_HOME=/opt/module/flume-1.9.0

export SQOOP_HOME=/opt/module/sqoop-1.4.6

export PATH=$SQOOP_HOME/bin:$FLUME_HOME/bin:$KAFKA_HOME/bin:$MYBIN_HOME:$SPARK_HOME/sbin:$SCALA_HOME/bin:$HIVE_HOME/bin:$ZOOKEEPER_HOME/bin:$SPARK_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$JAVA_HOME/bin:$PATH
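
To apply the file and sanity-check the layout, something like the following works (the version checks in the comments simply reflect the paths above):

source /etc/profile.d/my_env.sh
for v in JAVA_HOME HADOOP_HOME HIVE_HOME ZOOKEEPER_HOME SPARK_HOME KAFKA_HOME FLUME_HOME SQOOP_HOME; do
    echo "$v=${!v}"            # print each home directory to confirm the export took effect
done
java -version                  # expect 1.8.0_181
hadoop version | head -n 1     # expect Hadoop 3.2.2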

Component start/stop script

#!/bin/bash
if [ $# -lt 1 ]
then
    echo "No Args Input..."
    exit ;
fi
case $1 in
"start")
        echo " =================== 启动 zookeeper  ==================="
        ssh hadoop001 "zkServer.sh start"
        ssh hadoop002 "zkServer.sh start"
        ssh hadoop003 "zkServer.sh start"

        echo " =================== 启动 hadoop集群 ==================="

        echo " --------------- 启动 hdfs ---------------"
        ssh hadoop001 "/opt/module/hadoop-3.2.2/sbin/start-dfs.sh"
        echo " --------------- 启动 yarn ---------------"
        ssh hadoop001 "/opt/module/hadoop-3.2.2/sbin/start-yarn.sh"
        echo " --------------- 启动 historyserver ---------------"
        ssh hadoop001 "/opt/module/hadoop-3.2.2/bin/mapred --daemon start historyserver"
        
        echo " =================== 启动 kafka集群 ==================="
        ssh hadoop001 "kafka-server-start.sh -daemon /opt/module/kafka_2.11-2.4.1/config/server.properties"
        ssh hadoop002 "kafka-server-start.sh -daemon /opt/module/kafka_2.11-2.4.1/config/server.properties"
        ssh hadoop003 "kafka-server-start.sh -daemon /opt/module/kafka_2.11-2.4.1/config/server.properties"

;;
"stop")
        echo " =================== 关闭 kafka集群 ==================="
        ssh hadoop001 "kafka-server-stop.sh -daemon /opt/module/kafka_2.11-2.4.1/config/server.properties"
        ssh hadoop002 "kafka-server-stop.sh -daemon /opt/module/kafka_2.11-2.4.1/config/server.properties"
        ssh hadoop003 "kafka-server-stop.sh -daemon /opt/module/kafka_2.11-2.4.1/config/server.properties"

        echo " =================== 关闭 hadoop集群 ==================="

        echo " --------------- 关闭 historyserver ---------------"
        ssh hadoop001 "/opt/module/hadoop-3.2.2/bin/mapred --daemon stop historyserver"
        echo " --------------- 关闭 yarn ---------------"
        ssh hadoop001 "/opt/module/hadoop-3.2.2/sbin/stop-yarn.sh"
        echo " --------------- 关闭 hdfs ---------------"
        ssh hadoop001 "/opt/module/hadoop-3.2.2/sbin/stop-dfs.sh"

        echo " =================== 关闭 zookeeper  ==================="
        ssh hadoop001 "zkServer.sh stop"
        ssh hadoop002 "zkServer.sh stop"
        ssh hadoop003 "zkServer.sh stop"
;;
*)
    echo "Input Args Error..."
;;
esac
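
A hedged usage example; the script name cluster.sh is an assumption (save it under /opt/module/bin, which is already on PATH, and chmod +x it), and the jps check is a manual add-on rather than part of the script:

cluster.sh start
cluster.sh stop

# Optional: confirm the expected daemons are running on every node
for host in hadoop001 hadoop002 hadoop003; do
    echo "===== $host ====="
    ssh $host "jps | grep -E 'NameNode|DataNode|JournalNode|DFSZKFailoverController|ResourceManager|NodeManager|QuorumPeerMain|Kafka'"
done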

Hadoop configuration files

Hadoop version: 3.2.2
core-site.xml

<configuration>
<!-- HDFS entry point. "mycluster" is only the logical name of the cluster; it can be
     changed freely, but it must match dfs.nameservices in hdfs-site.xml -->
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://mycluster</value>
    </property>

<!-- hadoop.tmp.dir defaults to /tmp, which would keep NameNode and DataNode data in a
     volatile directory, so it is overridden here -->
    <property>
        <name>hadoop.tmp.dir</name>
        <value>/opt/module/hadoop-3.2.2/tmp</value>
    </property>

<!-- Static user for the web UI; without this the web pages report errors -->
    <property>
        <name>hadoop.http.staticuser.user</name>
        <value>root</value>
    </property>

<!-- ZooKeeper quorum; a single host works, use a comma-separated list for a cluster -->
    <property>
        <name>ha.zookeeper.quorum</name>
        <value>hadoop001:2181,hadoop002:2181,hadoop003:2181</value>
    </property>
    <!-- Timeout for Hadoop's ZooKeeper sessions -->
    <property>
        <name>ha.zookeeper.session-timeout.ms</name>
        <value>1000</value>
        <description>ms</description>
    </property>
    <property>
        <name>hadoop.proxyuser.root.hosts</name>
        <value>*</value>
    </property>

    <property>
        <name>hadoop.proxyuser.root.groups</name>
        <value>*</value>
    </property>
    <property>
        <name>io.compression.codecs</name>
        <value>
            org.apache.hadoop.io.compress.GzipCodec,
            org.apache.hadoop.io.compress.DefaultCodec,
            org.apache.hadoop.io.compress.BZip2Codec,
            org.apache.hadoop.io.compress.SnappyCodec,
            com.hadoop.compression.lzo.LzoCodec,
            com.hadoop.compression.lzo.LzopCodec
        </value>
    </property>

    <property>
        <name>io.compression.codec.lzo.class</name>
        <value>com.hadoop.compression.lzo.LzoCodec</value>
    </property>

</configuration>
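
The com.hadoop.compression.lzo codecs above are not part of stock Hadoop; they come from the hadoop-lzo jar referenced later in these notes (hadoop-lzo-0.4.20.jar). A minimal sketch for making sure it is present on every node, assuming the jar has already been built or downloaded on hadoop001:

JAR=/opt/module/hadoop-3.2.2/share/hadoop/common/hadoop-lzo-0.4.20.jar
for host in hadoop002 hadoop003; do
    scp "$JAR" $host:${JAR%/*}/    # copy from hadoop001 to the other nodes
    ssh $host "ls -l $JAR"         # verify it landed
done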

hdfs-site.xml

<configuration>
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>/opt/module/hadoop-3.2.2/data</value>
    </property>
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>/opt/module/hadoop-3.2.2/nn</value>
    </property>
    <property>
        <name>dfs.webhdfs.enabled</name>
        <value>true</value>
    </property>
    <property>
        <name>dfs.replication</name>
        <value>2</value>
    </property>

    <!-- dfs.permissions.enabled is the current name of the old dfs.permissions key -->
    <property>
        <name>dfs.permissions.enabled</name>
        <value>false</value>
    </property>


<!-- The HDFS nameservice, "mycluster" here; it must match the value used in core-site.xml.
     dfs.ha.namenodes.[nameservice id] assigns a unique identifier to every NameNode in the
     nameservice, as a comma-separated list of NameNode IDs that DataNodes use to recognize
     all of the NameNodes. Here the NameNode IDs are "nn1" and "nn2".
    -->
    <property>
        <name>dfs.nameservices</name>
        <value>mycluster</value>
    </property>
        <property>
        <name>dfs.ha.namenodes.mycluster</name>
        <value>nn1,nn2</value>
    </property>

    <!-- RPC address of nn1 -->
    <property>
        <name>dfs.namenode.rpc-address.mycluster.nn1</name>
        <value>hadoop001:9000</value>
    </property>

    <!-- HTTP address of nn1 -->
    <property>
        <name>dfs.namenode.http-address.mycluster.nn1</name>
        <value>hadoop001:9870</value>
    </property>

    <!-- RPC address of nn2 -->
    <property>
        <name>dfs.namenode.rpc-address.mycluster.nn2</name>
        <value>hadoop002:9000</value>
    </property>

    <!-- HTTP address of nn2 -->
    <property>
        <name>dfs.namenode.http-address.mycluster.nn2</name>
        <value>hadoop002:9870</value>
    </property>

    <!-- Shared storage for the NameNode edit log, i.e. the JournalNode list.
         URL format: qjournal://host1:port1;host2:port2;host3:port3/journalId
         Using the nameservice as the journalId is recommended; the default port is 8485 -->
    <property>
        <name>dfs.namenode.shared.edits.dir</name>
        <value>qjournal://hadoop001:8485;hadoop002:8485;hadoop003:8485/mycluster</value>
    </property>

    <!-- Local disk directory used by the JournalNodes -->
    <property>
        <name>dfs.journalnode.edits.dir</name>
        <value>/opt/module/hadoop-3.2.2/jn</value>
    </property>

    <!-- Enable automatic NameNode failover -->
    <property>
        <name>dfs.ha.automatic-failover.enabled</name>
        <value>true</value>
    </property>

    <!-- Failover proxy provider used by clients -->
    <property>
        <name>dfs.client.failover.proxy.provider.mycluster</name>
        <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
    </property>

    <!-- Fencing methods; multiple methods are separated by newlines, one per line -->
    <property>
        <name>dfs.ha.fencing.methods</name>
        <value>
            sshfence
            shell(/bin/true)
        </value>
    </property>

    <!-- sshfence needs passwordless SSH; point it at the private key -->
    <property>
        <name>dfs.ha.fencing.ssh.private-key-files</name>
        <value>/root/.ssh/id_rsa</value>
    </property>

    <!-- Connect timeout for the sshfence mechanism -->
    <property>
        <name>dfs.ha.fencing.ssh.connect-timeout</name>
        <value>30000</value>
    </property>

    <property>
        <name>ha.failover-controller.cli-check.rpc-timeout.ms</name>
        <value>60000</value>
    </property>
    
    
    <!-- Secondary NameNode address (not used when HA is enabled; leftover from a non-HA setup) -->
    <property>
        <name>dfs.namenode.secondary.http-address</name>
        <value>hadoop003:9868</value>
    </property>
    
    <property>
        <name>dfs.namenode.handler.count</name>
        <value>10</value>
    </property>

</configuration>
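
The notes do not record the one-time HA bootstrap. With this layout the usual sequence looks roughly like the following (standard Hadoop 3 commands; run once, in this order, before using the cluster script above, and with ZooKeeper already running for step 4):

# 1. Start a JournalNode on each host
for host in hadoop001 hadoop002 hadoop003; do
    ssh $host "hdfs --daemon start journalnode"
done

# 2. Format and start the first NameNode on hadoop001
ssh hadoop001 "hdfs namenode -format && hdfs --daemon start namenode"

# 3. Bootstrap the standby NameNode on hadoop002 from the active one
ssh hadoop002 "hdfs namenode -bootstrapStandby"

# 4. Initialize the failover znode in ZooKeeper, then bring up HDFS
ssh hadoop001 "hdfs zkfc -formatZK"
ssh hadoop001 "/opt/module/hadoop-3.2.2/sbin/start-dfs.sh"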

hadoop-env.sh

export HDFS_NAMENODE_USER=root
export HDFS_DATANODE_USER=root
export HDFS_ZKFC_USER=root
export HDFS_JOURNALNODE_USER=root

yarn-env.sh

YARN_RESOURCEMANAGER_USER=root
YARN_NODEMANAGER_USER=root

mapred-site.xml

<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<!-- MapReduce JobHistory Server address; the default port is 10020 -->
        <property>
                <name>mapreduce.jobhistory.address</name>
                <value>hadoop001:10020</value>
        </property>

        <!-- MapReduce JobHistory Server web UI address; the default port is 19888 -->
        <property>
                <name>mapreduce.jobhistory.webapp.address</name>
                <value>hadoop001:19888</value>
        </property>
</configuration>

yarn-site.xml

<configuration>
<!-- Enable ResourceManager HA -->
    <property>
        <name>yarn.resourcemanager.ha.enabled</name>
        <value>true</value>
    </property>

    <!-- Cluster ID for the RM pair -->
    <property>
        <name>yarn.resourcemanager.cluster-id</name>
        <value>yrc</value>
    </property>

    <!-- Logical IDs of the ResourceManagers -->
    <property>
        <name>yarn.resourcemanager.ha.rm-ids</name>
        <value>rm1,rm2</value>
    </property>

    <!-- Hostname of each ResourceManager -->
    <property>
        <name>yarn.resourcemanager.hostname.rm1</name>
        <value>hadoop001</value>
    </property>

    <property>
        <name>yarn.resourcemanager.hostname.rm2</name>
        <value>hadoop002</value>
    </property>

<property>
    <!-- RM web UI address; defaults to ${yarn.resourcemanager.hostname}:8088 -->
    <name>yarn.resourcemanager.webapp.address.rm1</name>
    <value>hadoop001:8088</value>
</property>
<property>
    <!-- RM web UI address; defaults to ${yarn.resourcemanager.hostname}:8088 -->
    <name>yarn.resourcemanager.webapp.address.rm2</name>
    <value>hadoop002:8088</value>
</property>


    <!-- ZooKeeper quorum address -->
    <property>
        <name>yarn.resourcemanager.zk-address</name>
        <value>hadoop001:2181,hadoop002:2181,hadoop003:2181</value>
    </property>
<!-- Shuffle service reducers use to fetch map output -->
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
<!-- Enable log aggregation -->
    <property>
        <name>yarn.log-aggregation-enable</name>
        <value>true</value>
    </property>
<!-- Keep aggregated logs for one day -->
    <property>
        <name>yarn.log-aggregation.retain-seconds</name>
        <value>86400</value>
    </property>

    <!-- Enable ResourceManager state recovery -->
    <property>
        <name>yarn.resourcemanager.recovery.enabled</name>
        <value>true</value>
    </property>

    <!-- Store the ResourceManager state in the ZooKeeper cluster -->
    <property>
        <name>yarn.resourcemanager.store.class</name>
        <value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
    </property>

<!-- Environment variables inherited by containers -->
    <property>
        <name>yarn.nodemanager.env-whitelist</name>
        <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
    </property>
    
    <!-- Minimum and maximum memory a YARN container may be allocated -->
    <property>
        <name>yarn.scheduler.minimum-allocation-mb</name>
        <value>512</value>
    </property>
    <property>
        <name>yarn.scheduler.maximum-allocation-mb</name>
        <value>4096</value>
    </property>
    
    <!-- Physical memory each NodeManager may hand out to containers -->
    <property>
        <name>yarn.nodemanager.resource.memory-mb</name>
        <value>4096</value>
    </property>
    
    <!-- Disable YARN's physical and virtual memory limit checks -->
    <property>
        <name>yarn.nodemanager.pmem-check-enabled</name>
        <value>false</value>
    </property>
    <property>
        <name>yarn.nodemanager.vmem-check-enabled</name>
        <value>false</value>
    </property>
</configuration>
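
All of the *-site.xml files above must be identical on the three nodes; the notes do not show how they were distributed, but a plain scp loop is enough:

for host in hadoop002 hadoop003; do
    scp /opt/module/hadoop-3.2.2/etc/hadoop/{core-site.xml,hdfs-site.xml,mapred-site.xml,yarn-site.xml} \
        $host:/opt/module/hadoop-3.2.2/etc/hadoop/
done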

ZooKeeper configuration files

zoo.cfg

tickTime=2000
initLimit=10
syncLimit=5
dataDir=/opt/module/apache-zookeeper-3.6.3-bin/data
dataLogDir=/opt/module/apache-zookeeper-3.6.3-bin/dataLog
clientPort=2181
autopurge.snapRetainCount=20
autopurge.purgeInterval=48


server.1=hadoop001:2888:3888
server.2=hadoop002:2888:3888
server.3=hadoop003:2888:3888

Configure myid on each host

/opt/module/apache-zookeeper-3.6.3-bin/data/myid
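
The file contains only the server number matching the server.N entries in zoo.cfg, so on this cluster (the data directory must exist first):

ssh hadoop001 "echo 1 > /opt/module/apache-zookeeper-3.6.3-bin/data/myid"
ssh hadoop002 "echo 2 > /opt/module/apache-zookeeper-3.6.3-bin/data/myid"
ssh hadoop003 "echo 3 > /opt/module/apache-zookeeper-3.6.3-bin/data/myid"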

Kafka configuration file

Version: kafka_2.11-2.4.1
server.properties




broker.id=1

num.network.threads=3
num.io.threads=8

socket.send.buffer.bytes=102400

socket.receive.buffer.bytes=102400

socket.request.max.bytes=104857600

log.dirs=/opt/module/kafka_2.11-2.4.1/datas

num.partitions=1

num.recovery.threads.per.data.dir=1

offsets.topic.replication.factor=1
transaction.state.log.replication.factor=1
transaction.state.log.min.isr=1

log.retention.hours=168

log.segment.bytes=1073741824

log.retention.check.interval.ms=300000

zookeeper.connect=hadoop001:2181,hadoop002:2181,hadoop003:2181/kafka

zookeeper.connection.timeout.ms=6000

group.initial.rebalance.delay.ms=0
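
broker.id must be unique per broker, so after copying server.properties to the other hosts it has to be changed there. A quick sketch (the 1/2/3 numbering mirrors the ZooKeeper myid convention above and is an assumption):

ssh hadoop002 "sed -i 's/^broker.id=.*/broker.id=2/' /opt/module/kafka_2.11-2.4.1/config/server.properties"
ssh hadoop003 "sed -i 's/^broker.id=.*/broker.id=3/' /opt/module/kafka_2.11-2.4.1/config/server.properties"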

Start/stop

kafka-server-start.sh -daemon /opt/module/kafka_2.11-2.4.1/config/server.properties
kafka-server-stop.sh

Usage

kafka-topics.sh --create --topic test --bootstrap-server hadoop001:9092 --partitions 2 --replication-factor 3
kafka-topics.sh --list --bootstrap-server hadoop001:9092
kafka-topics.sh --describe --bootstrap-server hadoop001:9092 --topic test
Producer
kafka-console-producer.sh --topic test --broker-list hadoop001:9092
Consumer
kafka-console-consumer.sh --topic test --bootstrap-server hadoop001:9092
By consumer group (start several consumers with the same group name and the partitions are split among them)
kafka-console-consumer.sh --topic test --bootstrap-server hadoop001:9092 --group g1

Flume configuration files

Version: 1.9.0
Delete guava-11.0.2.jar from Flume's lib directory (it conflicts with the newer Guava shipped with Hadoop 3).

Flume config: TAILDIR source, Kafka channel (no sink; the channel writes straight to Kafka)

/opt/module/flume-1.9.0/jobs/gmall/logserver-flume-kafka.conf

a1.sources = r1
a1.channels = c1

a1.sources.r1.type = TAILDIR
a1.sources.r1.positionFile = /opt/module/flume-1.9.0/jobs/position
a1.sources.r1.filegroups = f1
a1.sources.r1.filegroups.f1 = /opt/module/applog/log/app.*

a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = com.atguigu.gmall.interceptor.EtlLogInterceptor$MyBuilder

a1.channels.c1.type = org.apache.flume.channel.kafka.KafkaChannel
a1.channels.c1.kafka.bootstrap.servers = hadoop001:9092,hadoop002:9092,hadoop003:9092
a1.channels.c1.kafka.topic = topic_log
a1.channels.c1.parseAsFlumeEvent = false

a1.sources.r1.channels = c1

Interceptor
pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>atguigu-spark-211223</artifactId>
        <groupId>com.atguigu.bigdata</groupId>
        <version>1.0</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.atguigu.gmall</groupId>
    <artifactId>collect0110</artifactId>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.flume</groupId>
            <artifactId>flume-ng-core</artifactId>
            <version>1.9.0</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.62</version>
            <scope>compile</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>2.3.2</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

</project>

Interceptor class

package com.atguigu.gmall.interceptor;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONException;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;

import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.List;

public class EtlLogInterceptor implements Interceptor {
    @Override
    public void initialize() {
    }

    @Override
    public Event intercept(Event event) {
        String body = new String(event.getBody(), StandardCharsets.UTF_8);
        try {
            JSON.parseObject(body);
        } catch (JSONException e) {
            return null;
        }
        return event;
    }

    @Override
    public List<Event> intercept(List<Event> list) {
        Iterator<Event> iterator = list.iterator();
        while (iterator.hasNext()) {
            Event next = iterator.next();
            if (intercept(next) == null) {
                iterator.remove();
            }
        }
        return list;
    }

    @Override
    public void close() {
    }

    public static class MyBuilder implements Builder {

        @Override
        public Interceptor build() {
            return new EtlLogInterceptor();
        }

        @Override
        public void configure(Context context) {
        }
    }
}

Start

flume-ng agent -c $FLUME_HOME/conf -f /opt/module/flume-1.9.0/jobs/gmall/logserver-flume-kafka.conf -n a1 -Dflume.root.logger=INFO,console

Check with a Kafka console consumer

kafka-console-consumer.sh --topic topic_log --bootstrap-server hadoop001:9092,hadoop002:9092,hadoop003:9092 --from-beginning

Start/stop script

#!/bin/bash
if [ $# -lt 1 ]
then 
  echo "USAGE: f1.sh {start|stop}"
  exit
fi

case $1 in
start)
        for i in hadoop001 hadoop002
        do
          ssh $i "nohup flume-ng agent -c $FLUME_HOME/conf -f /opt/module/flume-1.9.0/jobs/gmall/logserver-flume-kafka.conf -n a1 -Dflume.root.logger=INFO,console 1>$FLUME_HOME/logs/flume.log 2>&1 &"
        done  
;;

stop)
        for i in hadoop001 hadoop002
        do
          ssh $i "ps -ef | grep logserver-flume-kafka.conf | grep -v grep | awk '{print \$2}' | xargs -n1 kill -9"
        done 
;;

*)
  echo "USAGE: f1.sh {start|stop}"
  exit
;;
esac 

Flume config: Kafka source, file channel, HDFS sink

Interceptor

package com.atguigu.gmall.interceptor;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;

import java.nio.charset.StandardCharsets;
import java.util.List;

public class TimeStampInterceptor implements Interceptor {
    @Override
    public void initialize() {}

    @Override
    public Event intercept(Event event) {
        String body = new String(event.getBody(), StandardCharsets.UTF_8);
        JSONObject jsonObject = JSON.parseObject(body);
        String ts = jsonObject.getString("ts");
        event.getHeaders().put("timestamp", ts);
        return event;
    }

    @Override
    public List<Event> intercept(List<Event> list) {
        for (Event event : list) {
            intercept(event);
        }
        return list;
    }

    @Override
    public void close() {}

    public static class MyBuilder implements Builder {

        @Override
        public Interceptor build() {
            return new TimeStampInterceptor();
        }

        @Override
        public void configure(Context context) {}
    }
}

kafka-flume-hdfs.conf

a1.sources = r1
a1.channels = c1
a1.sinks = k1

a1.sources.r1.type = org.apache.flume.source.kafka.KafkaSource
a1.sources.r1.kafka.bootstrap.servers = hadoop001:9092,hadoop002:9092,hadoop003:9092
a1.sources.r1.kafka.topics = topic_log
a1.sources.r1.kafka.consumer.group.id = gmall
a1.sources.r1.batchDurationMillis = 2000

a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = com.atguigu.gmall.interceptor.TimeStampInterceptor$MyBuilder

a1.channels.c1.type = file
a1.channels.c1.dataDirs = /opt/module/flume-1.9.0/jobs/filechannel
a1.channels.c1.capacity = 1000000
a1.channels.c1.checkpointDir = /opt/module/flume-1.9.0/jobs/checkpoint
a1.channels.c1.transactionCapacity = 10000
a1.channels.c1.maxFileSize = 2146425071
a1.channels.c1.keep-alive = 5

a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = /origin_data/gmall/log/topic_log/%Y-%m-%d
a1.sinks.k1.hdfs.filePrefix = log-
a1.sinks.k1.hdfs.round = false

a1.sinks.k1.hdfs.rollInterval = 10
a1.sinks.k1.hdfs.rollSize = 134217728
a1.sinks.k1.hdfs.rollCount = 0

a1.sinks.k1.hdfs.fileType = CompressedStream
a1.sinks.k1.hdfs.codeC = lzop


a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1

Start/stop script

#!/bin/bash
if [ $# -lt 1 ]
then 
  echo "USAGE: f1.sh {start|stop}"
  exit
fi

case $1 in
start)
        for i in hadoop003
        do
          ssh $i "nohup flume-ng agent -c $FLUME_HOME/conf -f /opt/module/flume-1.9.0/jobs/gmall/kafka-flume-hdfs.conf -n a1 -Dflume.root.logger=INFO,console 1>$FLUME_HOME/logs/flume.log 2>&1 &"
        done  
;;

stop)
        for i in hadoop003
        do
          ssh $i "ps -ef | grep kafka-flume-hdfs.conf | grep -v grep | awk '{print \$2}' | xargs -n1 kill -9"
        done 
;;

*)
  echo "USAGE: f1.sh {start|stop}"
  exit
;;
esac 

Flume memory tuning

In $FLUME_HOME/conf/flume-env.sh, adjust -Xms and -Xmx in JAVA_OPTS.
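
For example (the heap sizes here are illustrative, not values from the original notes):

export JAVA_OPTS="-Xms1024m -Xmx2048m -Dcom.sun.management.jmxremote"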

Sqoop configuration files

sqoop-env.sh

export HADOOP_COMMON_HOME=/opt/module/hadoop-3.2.2
export HADOOP_MAPRED_HOME=/opt/module/hadoop-3.2.2
export HIVE_HOME=/opt/module/apache-hive-3.1.2-bin
export ZOOKEEPER_HOME=/opt/module/apache-zookeeper-3.6.3-bin
export ZOOCFGDIR=/opt/module/apache-zookeeper-3.6.3-bin/conf

Verify

sqoop list-databases --connect "jdbc:mysql://hadoop001:3306/gmall" --username root --password hadoop

Importing data into HDFS with Sqoop

sqoop import \
--connect "jdbc:mysql://hadoop001:3306/gmall" \
--username root --password hadoop \
--table user_info \
--columns id,login_name,nick_name \
--where "id >=100 and id <= 200" \
--target-dir /testsqoop \
--delete-target-dir \
--num-mappers 2 \
--split-by id \
--fields-terminated-by "\t"

sqoop import \
--connect "jdbc:mysql://hadoop001:3306/gmall" \
--username root --password hadoop \
--query "select id,login_name,nick_name from user_info where id >= 100 and id <=200 and \$CONDITIONS" \
--target-dir /testsqoop \
--delete-target-dir \
--num-mappers 2 \
--split-by id \
--fields-terminated-by "\t" \
--compress \
--compression-codec lzop \
--null-string '\\N' \
--null-non-string '\\N'

Import script

#! /bin/bash

APP=gmall
sqoop=/opt/module/sqoop-1.4.6/bin/sqoop

# The second argument is the business date; exit if it is not provided
if [ -n "$2" ] ;then
   do_date=$2
else
   echo "Please pass a date argument"
   exit
fi

import_data(){
$sqoop import \
--connect jdbc:mysql://hadoop001:3306/$APP \
--username root \
--password hadoop \
--target-dir /origin_data/$APP/db/$1/$do_date \
--delete-target-dir \
--query "$2 where \$CONDITIONS" \
--num-mappers 1 \
--fields-terminated-by '\t' \
--compress \
--compression-codec lzop \
--null-string '\\N' \
--null-non-string '\\N'

hadoop jar /opt/module/hadoop-3.2.2/share/hadoop/common/hadoop-lzo-0.4.20.jar com.hadoop.compression.lzo.DistributedLzoIndexer /origin_data/$APP/db/$1/$do_date
}

import_order_info(){
  import_data order_info "select
                            id, 
                            total_amount, 
                            order_status, 
                            user_id, 
                            payment_way,
                            delivery_address,
                            out_trade_no, 
                            create_time, 
                            operate_time,
                            expire_time,
                            tracking_no,
                            province_id,
                            activity_reduce_amount,
                            coupon_reduce_amount,                            
                            original_total_amount,
                            feight_fee,
                            feight_fee_reduce      
                        from order_info"
}

case $1 in
  "order_info")
     import_order_info
;;
  "all")
   import_base_category1
;;
esac
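
A hedged usage example; the script file name mysql_to_hdfs.sh is an assumption, and the full script defines more import_* functions (such as import_base_category1 used by the "all" branch) that are not reproduced in these notes:

mysql_to_hdfs.sh order_info 2022-01-10    # import one table for the given business date
mysql_to_hdfs.sh all 2022-01-10           # import every table the full script knows about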

Hive configuration files

Adjust the JARs under lib

mv $HIVE_HOME/lib/log4j-slf4j-impl-2.10.0.jar $HIVE_HOME/lib/log4j-slf4j-impl-2.10.0.jar.bak
cp mysql-connector-java-5.1.32.jar $HIVE_HOME/lib/
cp /opt/module/hadoop-3.2.2/share/hadoop/common/lib/guava-27.0-jre.jar $HIVE_HOME/lib/

hive-site.xml

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
    <name>hive.exec.scratchdir</name>
    <value>/opt/module/apache-hive-3.1.2-bin/tmp</value>
 </property>

<!-- Hive's default warehouse directory on HDFS -->
<property>
    <name>hive.metastore.warehouse.dir</name>
    <value>/user/hive/warehouse</value>
 </property>
 <property>
    <name>hive.querylog.location</name>
    <value>/user/hive/log</value>
 </property>
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:mysql://hadoop001:3306/hive?createDatabaseIfNotExist=true</value>
    <description>JDBC connect string for a JDBC metastore</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.jdbc.Driver</value>
    <description>Driver class name for a JDBC metastore</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>root</value>
    <description>username to use against metastore database</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>hadoop</value>
    <description>password to use against metastore database</description>
  </property>

    <!-- Disable metastore schema verification -->
    <property>
        <name>hive.metastore.schema.verification</name>
        <value>false</value>
    </property>

    <!-- Port HiveServer2 listens on -->
    <property>
    <name>hive.server2.thrift.port</name>
    <value>10010</value>
    </property>

    <!-- Host HiveServer2 binds to -->
    <property>
        <name>hive.server2.thrift.bind.host</name>
        <value>hadoop001</value>
    </property>

    <!-- Metastore event notification API auth -->
    <property>
        <name>hive.metastore.event.db.notification.api.auth</name>
        <value>false</value>
    </property>
    
     <!-- Thrift URI clients use to reach the metastore -->
    <property>
        <name>hive.metastore.uris</name>
        <value>thrift://hadoop001:9083</value>
    </property>
      
    <property>
        <name>hive.cli.print.header</name>
        <value>true</value>
    </property>

    <property>
        <name>hive.cli.print.current.db</name>
        <value>true</value>
    </property>
</configuration>

hive-log4j2.properties

property.hive.log.dir = /opt/module/apache-hive-3.1.2-bin/logs
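
The notes skip metastore initialization and service startup; with this MySQL-backed configuration a typical first run looks like the following (standard Hive commands, ports as configured above):

# Initialize the metastore schema in MySQL once
schematool -dbType mysql -initSchema

# Start the metastore and HiveServer2 in the background
nohup hive --service metastore   1>/tmp/metastore.log    2>&1 &
nohup hive --service hiveserver2 1>/tmp/hiveserver2.log  2>&1 &

# Connect through Beeline on the port set in hive-site.xml
beeline -u jdbc:hive2://hadoop001:10010 -n root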

Hive on Spark

To use Spark 3.x with Hive 3.1.2, Hive has to be recompiled against it.
Create a spark-defaults.conf under Hive's conf directory:

spark.master yarn
spark.yarn.historyServer.address hadoop001:18080
spark.history.ui.port 18080
spark.eventLog.enabled true
spark.eventLog.dir hdfs://mycluster/spark/sparkhistory
spark.eventLog.compress true
spark.driver.memory 1g
spark.executor.memory 2g

Create the hdfs://mycluster/spark/sparkhistory directory.
Upload a copy of Spark's jars to HDFS (the "without hadoop" Spark build is what gets uploaded):

hadoop fs -mkdir /spark-jars
hadoop fs -put /opt/module/spark-3.2.0-bin-hadoop3.2/jars/* /spark-jars
hadoop fs -rm -r -f /spark-jars/guava-14.0.1.jar
hadoop fs -put /opt/module/apache-hive-3.1.2-bin/lib/guava-27.0-jre.jar /spark-jars
hadoop fs -rm -r -f /spark-jars/hive*
hadoop fs -put /opt/module/hadoop-3.2.2/share/hadoop/common/hadoop-lzo-0.4.20.jar /spark-jars

Additions to hive-site.xml

<!-- Location of the Spark jars on HDFS. With HA the nameservice URI is used; in a non-HA setup the port (e.g. 8020) must match the NameNode RPC port -->
<property>
    <name>spark.yarn.jars</name>
    <value>hdfs://mycluster/spark-jars/*</value>
</property>

<!-- Hive execution engine -->
<property>
    <name>hive.execution.engine</name>
    <value>spark</value>
</property>

<!-- Hive-to-Spark client connect timeout -->
<property>
    <name>hive.spark.client.connect.timeout</name>
    <value>10000ms</value>
</property>

Edit Hadoop's capacity-scheduler.xml, distribute it to every node, and restart YARN:

<property>
    <name>yarn.scheduler.capacity.maximum-am-resource-percent</name>
    <value>0.8</value>
</property>

Test Hive on Spark

beeline -u jdbc:hive2://hadoop001:10010 -n root
create table student(id int, name string);
insert into table student values(1,'abc');

Spark on YARN configuration files

Version: 3.2.0
Check yarn-site.xml:

<!-- Whether to run a thread that checks each container's physical memory use and kills containers that exceed their allocation; default true -->
<property>
     <name>yarn.nodemanager.pmem-check-enabled</name>
     <value>false</value>
</property>

<!-- Whether to run a thread that checks each container's virtual memory use and kills containers that exceed their allocation; default true -->
<property>
     <name>yarn.nodemanager.vmem-check-enabled</name>
     <value>false</value>
</property>

Copy hive-site.xml into Spark's conf directory and change the following settings:

<property>
    <name>hive.server2.thrift.port</name>
    <value>10000</value>
</property>

<property>
    <name>hive.exec.dynamic.partition.mode</name>
    <value>nonstrict</value>
</property>

Copy the MySQL connector jar into Spark's jars directory.
Copy the compiled hadoop-lzo jar into jars as well (not needed if LZO compression is not used).
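
For example, with jar names taken from earlier steps in these notes (the source path of the MySQL connector is an assumption):

cp /opt/module/mysql-connector-java-5.1.32.jar /opt/module/spark-3.2.0-bin-hadoop3.2/jars/
cp /opt/module/hadoop-3.2.2/share/hadoop/common/hadoop-lzo-0.4.20.jar /opt/module/spark-3.2.0-bin-hadoop3.2/jars/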
spark-env.sh

export JAVA_HOME=/opt/module/jdk1.8.0_181
export SCALA_HOME=/opt/module/scala-2.12.11
export SPARK_CONF_DIR=/opt/module/spark-3.2.0-bin-hadoop3.2/conf
export HADOOP_CONF_DIR=/opt/module/hadoop-3.2.2/etc/hadoop
export YARN_CONF_DIR=/opt/module/hadoop-3.2.2/etc/hadoop
export SPARK_HISTORY_OPTS="-Dspark.history.ui.port=18080 -Dspark.history.retainedApplications=30  -Dspark.history.fs.logDirectory=hdfs://mycluster/spark/sparkhistory"

spark-defaults.conf

# Use YARN as the Spark master
spark.master=yarn
# Spark history server address
spark.yarn.historyServer.address hadoop001:18080
# Path the history server reads job logs from
spark.history.fs.logDirectory=hdfs://mycluster/spark/sparkhistory
spark.history.ui.port 18080
# Whether to record Spark event logs
spark.eventLog.enabled true
# Where the event logs are stored
spark.eventLog.dir hdfs://mycluster/spark/sparkhistory
spark.eventLog.compress true

# Enable Spark SQL adaptive query execution
spark.sql.adaptive.enabled=true
# Adaptively coalesce reduce-side partitions
spark.sql.adaptive.coalescePartitions.enabled=true
# Use Hive's Parquet (de)serialization for compatibility with Hive
spark.sql.hive.convertMetastoreParquet=false
# Write legacy-format Parquet files for compatibility with Hive
spark.sql.parquet.writeLegacyFormat=true
# Work around SPARK-21725
spark.hadoop.fs.hdfs.impl.disable.cache=true
# Relax Spark SQL's store-assignment checks for compatibility with Hive
spark.sql.storeAssignmentPolicy=LEGACY

Start the history server

hadoop fs -mkdir -p hdfs://mycluster/spark/sparkhistory
sbin/start-history-server.sh 

Submit an application

bin/spark-submit \
--class org.apache.spark.examples.SparkPi \
--master yarn \
--deploy-mode client \
./examples/jars/spark-examples_2.12-3.2.0.jar \
10