Installing the Three Core Hadoop Components and How They Work

Installation

Upload the packages to the virtual machine over sftp from SecureCRT

lcd C:\Users\cp130\Desktop\hadoop
put hadoop-2.7.4.tar.gz zookeeper-3.4.14.tar.gz jdk-8u211-linux-x64.tar.gz

Extract the archives

mkdir /opt/java && tar -zxf /root/jdk-8u211-linux-x64.tar.gz -C /opt/java &&\
mkdir /opt/zookeeper && tar -zxf /root/zookeeper-3.4.14.tar.gz -C /opt/zookeeper &&\
mkdir /opt/hadoop && tar -zxf /root/hadoop-2.7.4.tar.gz -C /opt/hadoop

System environment variables

echo '
#java
export JAVA_HOME=/opt/java/jdk1.8.0_211
export JRE_HOME=/opt/java/jdk1.8.0_211/jre
export PATH=$PATH:$JAVA_HOME/bin:$JRE_HOME/bin
export CLASSPATH=.:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar:$JRE_HOME/lib

#hadoop
export HADOOP_HOME=/opt/hadoop/hadoop-2.7.4
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib:$HADOOP_COMMON_LIB_NATIVE_DIR"
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop

#zookeeper
export ZOOKEEPER_HOME=/opt/zookeeper/zookeeper-3.4.14
export PATH=$PATH:$ZOOKEEPER_HOME/bin
' >> /etc/profile && source /etc/profile
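A quick optional sanity check, assuming the profile above was sourced correctly: all three tools should now resolve from PATH.

java -version
hadoop version
which zkServer.sh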

Configure ZooKeeper

mv /opt/zookeeper/zookeeper-3.4.14/conf/zoo_sample.cfg /opt/zookeeper/zookeeper-3.4.14/conf/zoo.cfg &&\
sed -ri 's/\/tmp\/zookeeper/\/data\/zookeeper/g' /opt/zookeeper/zookeeper-3.4.14/conf/zoo.cfg &&\
cat >> /opt/zookeeper/zookeeper-3.4.14/conf/zoo.cfg <<EOF
server.1=worker3:2888:3888
server.2=worker4:2888:3888
server.3=worker5:2888:3888
EOF
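To confirm both the sed rewrite and the appended server entries landed, the relevant lines of zoo.cfg can be inspected (a quick check, not required):

grep -E '^(dataDir|server\.)' /opt/zookeeper/zookeeper-3.4.14/conf/zoo.cfg
# expected:
# dataDir=/data/zookeeper
# server.1=worker3:2888:3888
# server.2=worker4:2888:3888
# server.3=worker5:2888:3888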

Configure Hadoop

Configuring HA mode means the configuration files get fairly involved.

#-----------------------------------------------------------------------#
#            Hadoop configuration (HDFS / MapReduce / YARN)              #
#-----------------------------------------------------------------------#

#/opt/hadoop/hadoop-2.7.4/etc/hadoop/hadoop-env.sh
sed -ri 's/\$\{JAVA_HOME\}/\/opt\/java\/jdk1.8.0_211/g' /opt/hadoop/hadoop-2.7.4/etc/hadoop/hadoop-env.sh

#/opt/hadoop/hadoop-2.7.4/etc/hadoop/slaves
cat > /opt/hadoop/hadoop-2.7.4/etc/hadoop/slaves <<EOF
worker3
worker4
worker5
EOF

#/opt/hadoop/hadoop-2.7.4/etc/hadoop/core-site.xml
sed -i '/^<configuration>/,/<\/configuration>$/d' /opt/hadoop/hadoop-2.7.4/etc/hadoop/core-site.xml &&\
cat >> /opt/hadoop/hadoop-2.7.4/etc/hadoop/core-site.xml <<EOF
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://NameNs</value>
    </property>
    <property>
        <name>hadoop.tmp.dir</name>
        <value>file:///data/hadoop/tmp</value>
    </property>
    <property>
        <name>hadoop.http.staticuser.user</name>
        <value>root</value>
    </property>
    <property>
        <name>ha.zookeeper.quorum</name>
        <value>worker3:2181,worker4:2181,worker5:2181</value>
    </property>
    <!-- Hue WebHDFS proxy user setting -->
    <property>
        <name>hadoop.proxyuser.root.hosts</name>
        <value>*</value>
    </property>
    <property>
        <name>hadoop.proxyuser.root.groups</name>
        <value>*</value>
    </property>
    <property>
         <name>hadoop.proxyuser.hue.hosts</name>
         <value>*</value>
    </property>
    <property>
         <name>hadoop.proxyuser.hue.groups</name>
         <value>*</value>
    </property>
    <property>
        <name>hadoop.proxyuser.httpfs.hosts</name>
        <value>*</value>
    </property>
    <property>
        <name>hadoop.proxyuser.httpfs.groups</name>
        <value>*</value>
    </property>
    <property>
        <name>hadoop.proxyuser.hbase.hosts</name>
        <value>*</value>
    </property>
    <property>
        <name>hadoop.proxyuser.hbase.groups</name>
        <value>*</value>
    </property>
    <property>
        <name>hbase.thrift.support.proxyuser</name>
        <value>true</value>
    </property>
    <property>
        <name>hbase.regionserver.thrift.http</name>
        <value>true</value>
    </property>
</configuration>
EOF
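A quick way to confirm that the file parses and that the nameservice is picked up; this reads only the local configuration, so the cluster does not need to be running:

hdfs getconf -confKey fs.defaultFS
# expected: hdfs://NameNs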

#/opt/hadoop/hadoop-2.7.4/etc/hadoop/hdfs-site.xml
sed -i '/^<configuration>/,/<\/configuration>$/d' /opt/hadoop/hadoop-2.7.4/etc/hadoop/hdfs-site.xml &&\
cat >> /opt/hadoop/hadoop-2.7.4/etc/hadoop/hdfs-site.xml <<EOF
<configuration>
    <property>
        <name>dfs.replication</name>
        <value>2</value>
    </property>
    <property>
        <name>dfs.permissions.enabled</name>
        <value>false</value>
    </property>
    <property>
        <name>dfs.blocksize</name>
        <value>134217728</value>
    </property>
    <property>
        <name>dfs.nameservices</name>
        <value>NameNs</value>
    </property>
    <property>
        <name>dfs.ha.namenodes.NameNs</name>
        <value>NameN1,NameN2</value>
    </property>
    <property>
        <name>dfs.namenode.rpc-address.NameNs.NameN1</name>
        <value>worker3:9000</value>
    </property>
    <property>
        <name>dfs.namenode.http-address.NameNs.NameN1</name>
        <value>worker3:50070</value>
    </property>
    <property>
        <name>dfs.namenode.rpc-address.NameNs.NameN2</name>
        <value>worker4:9000</value>
    </property>
    <property>
        <name>dfs.namenode.http-address.NameNs.NameN2</name>
        <value>worker4:50070</value>
    </property>
    <property>
        <name>dfs.namenode.shared.edits.dir</name>
        <value>qjournal://worker3:8485;worker4:8485;worker5:8485/NameNs</value>
    </property>
    <property>
        <name>dfs.journalnode.edits.dir</name>
        <value>/data/hadoop/journal</value>
    </property>
    <property>
        <name>dfs.client.failover.proxy.provider.NameNs</name>
        <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
    </property>
    <property>
        <name>dfs.ha.fencing.methods</name>
        <value>
            sshfence
            shell(/bin/true)
        </value>
    </property>
    <property>
        <name>dfs.ha.fencing.ssh.private-key-files</name>
        <value>/root/.ssh/id_rsa</value>
    </property>
    <property>
        <name>dfs.ha.fencing.ssh.connect-timeout</name>
        <value>30000</value>
    </property>
    <property>
        <name>dfs.ha.automatic-failover.enabled</name>
        <value>true</value>
    </property>
    <property>
        <name>dfs.webhdfs.enabled</name>
        <value>true</value>
    </property>
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>file:///data/hadoop/dfs/name</value>
    </property>
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>file:///data/hadoop/dfs/data</value>
    </property>
</configuration>
EOF
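As a check, the getconf tool should list both NameNode hosts defined above:

hdfs getconf -namenodes
# expected: worker3 worker4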

#/opt/hadoop/hadoop-2.7.4/etc/hadoop/mapred-site.xml
mv /opt/hadoop/hadoop-2.7.4/etc/hadoop/mapred-site.xml.template /opt/hadoop/hadoop-2.7.4/etc/hadoop/mapred-site.xml &&\
sed -i '/^<configuration>/,/<\/configuration>$/d' /opt/hadoop/hadoop-2.7.4/etc/hadoop/mapred-site.xml &&\
cat >> /opt/hadoop/hadoop-2.7.4/etc/hadoop/mapred-site.xml <<EOF
<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
    <property>
        <name>mapreduce.jobhistory.address</name>
        <value>worker3:10020</value>
    </property>
    <property>
        <name>mapreduce.jobhistory.webapp.address</name>
        <value>worker3:19888</value>
    </property>
    <property>
        <name>mapreduce.map.env</name>
        <value>HADOOP_MAPRED_HOME=/opt/hadoop/hadoop-2.7.4</value>
    </property>
    <property>
        <name>mapreduce.reduce.env</name>
        <value>HADOOP_MAPRED_HOME=/opt/hadoop/hadoop-2.7.4</value>
    </property>
</configuration>
EOF
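Note that neither start-dfs.sh/start-yarn.sh nor the per-daemon commands below start the JobHistory server these two addresses point at; once the cluster is up, it can be launched on worker3 with the bundled script:

[root@worker3]# mr-jobhistory-daemon.sh start historyserver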

#/opt/hadoop/hadoop-2.7.4/etc/hadoop/yarn-site.xml
sed -i '/^<configuration>/,/<\/configuration>$/d' /opt/hadoop/hadoop-2.7.4/etc/hadoop/yarn-site.xml &&\
cat >> /opt/hadoop/hadoop-2.7.4/etc/hadoop/yarn-site.xml <<EOF
<configuration>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.log-aggregation-enable</name>
        <value>true</value>
    </property>
    <property>
        <name>yarn.log-aggregation.retain-seconds</name>
        <value>106800</value>
    </property>
    <property>
        <name>yarn.resourcemanager.ha.enabled</name>
        <value>true</value>
    </property>
    <property>
        <name>yarn.resourcemanager.cluster-id</name>
        <value>cluster1</value>
    </property>
    <property>
        <name>yarn.resourcemanager.ha.rm-ids</name>
        <value>rm1,rm2</value>
    </property>
    <property>
        <name>yarn.resourcemanager.hostname.rm1</name>
        <value>worker3</value>
    </property>
    <property>
        <name>yarn.resourcemanager.hostname.rm2</name>
        <value>worker4</value>
    </property>
    <property>
        <name>yarn.resourcemanager.zk-address</name>
        <value>worker3:2181,worker4:2181,worker5:2181</value>
    </property>
</configuration>
EOF
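The files above are edited on a single node. Before initialization, the same installation directories and profile entries must exist on all three machines. A minimal sketch, assuming everything so far was done on worker3 and root has passwordless ssh to the other nodes:

for host in worker4 worker5; do
    scp -r /opt/java /opt/zookeeper /opt/hadoop $host:/opt/
    scp /etc/profile $host:/etc/profile
done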

Start the components one by one

# --------------------------------- Initialization ----------------------------- #

[root@worker3]# mkdir -p /data/hadoop/dfs/name
[root@worker4]# mkdir -p /data/hadoop/dfs/name
[root@worker5]# mkdir -p /data/hadoop/dfs/name

[root@worker3]# mkdir /data/zookeeper && echo 1 > /data/zookeeper/myid
[root@worker4]# mkdir /data/zookeeper && echo 2 > /data/zookeeper/myid
[root@worker5]# mkdir /data/zookeeper && echo 3 > /data/zookeeper/myid

# Working directory for HBase (HBase itself is not covered in this post)
[root@worker3]# mkdir -p /data/hbase/tmp
[root@worker4]# mkdir -p /data/hbase/tmp
[root@worker5]# mkdir -p /data/hbase/tmp

# Initialize the Hive metastore schema in MySQL (Hive/MySQL setup not covered in this post)
[root@worker3]# schematool -dbType mysql -initSchema

# -------------------------------- Start the cluster ---------------------------- #
# ZooKeeper
[root@worker3]# zkServer.sh start
[root@worker4]# zkServer.sh start
[root@worker5]# zkServer.sh start


# HDFS HA (start-all.sh is deprecated; use start-dfs.sh / start-yarn.sh, or the per-daemon commands below)
[root@worker3]# hdfs zkfc -formatZK

[root@worker3]# hadoop-daemon.sh start journalnode
[root@worker4]# hadoop-daemon.sh start journalnode
[root@worker5]# hadoop-daemon.sh start journalnode

[root@worker3]# hdfs namenode -format NameNs
[root@worker3]# hadoop-daemon.sh start namenode
[root@worker3]# hadoop-daemon.sh start zkfc

[root@worker4]# hdfs namenode -bootstrapStandby
[root@worker4]# hadoop-daemon.sh start namenode
[root@worker4]# hadoop-daemon.sh start zkfc

[root@worker3]# hadoop-daemon.sh start datanode
[root@worker4]# hadoop-daemon.sh start datanode
[root@worker5]# hadoop-daemon.sh start datanode

	
# Yarn
[root@worker3]# yarn-daemon.sh start resourcemanager
[root@worker4]# yarn-daemon.sh start resourcemanager
[root@worker3]# yarn-daemon.sh start nodemanager
[root@worker4]# yarn-daemon.sh start nodemanager
[root@worker5]# yarn-daemon.sh start nodemanager
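
Once everything is up, the HA roles can be checked from any node; one NameNode and one ResourceManager should report active and the other standby:

[root@worker3]# jps
[root@worker3]# hdfs haadmin -getServiceState NameN1
[root@worker3]# hdfs haadmin -getServiceState NameN2
[root@worker3]# yarn rmadmin -getServiceState rm1
[root@worker3]# yarn rmadmin -getServiceState rm2
[root@worker3]# hdfs dfsadmin -report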

How It Works

Once the volume of data outgrows what a single machine can process, Hadoop is the solution that emerged for computation at that scale. It covers three aspects: a storage platform, computation logic, and compute resources.

The storage platform is HDFS. Think of data as a matrix made of two parts, addressing and content; HDFS mirrors this split by separating file names from file contents. The NameNode stores the file names and records, for each name, the addresses of the blocks that hold the file's actual content, while the DataNodes store the content itself, splitting files into blocks and keeping redundant replicas so that very large files can be stored. Together the two form HDFS and expose files to the outside world. HBase adds one more layer on top, expanding the addressing of this sparse matrix from a single flat level into a tree.
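
This split is easy to see from the shell once the cluster above is running: put a file into HDFS, then ask fsck how the NameNode mapped it to blocks and which DataNodes hold the replicas (/demo is just a scratch path used for illustration):

hdfs dfs -mkdir -p /demo
hdfs dfs -put /opt/hadoop/hadoop-2.7.4/LICENSE.txt /demo/
hdfs fsck /demo/LICENSE.txt -files -blocks -locations
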
Computation logic corresponds to MapReduce. Operators come in many shapes, but viewed abstractly they fall into just two classes: f(x) and f(x, y). MapReduce is the most bare-bones implementation of these two operator classes. Hive is another way to express computation logic, exposing an API in the form of SQL, but underneath it still reduces to the same two classes. Spark's RDDs are likewise built from these two kinds of operators, with the whole computation flow forming a directed acyclic graph, and DataFrame is again a SQL-style way of expressing it.
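
A minimal sketch of this two-operator view using Hadoop Streaming, reusing the /demo file uploaded above: the mapper is an f(x) that turns lines into words, and the reducer is an f(x, y) that folds identical keys into a count.

hadoop jar /opt/hadoop/hadoop-2.7.4/share/hadoop/tools/lib/hadoop-streaming-2.7.4.jar \
    -input /demo/LICENSE.txt -output /demo/wordcount \
    -mapper 'tr -s " " "\n"' \
    -reducer 'uniq -c'
hdfs dfs -cat /demo/wordcount/part-*
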
Compute resources correspond to YARN. Its scheduling follows a 1:N shape: a single manager is designated to schedule resources, while the actual computation is carried out by the workers.
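
The 1:N shape is visible directly from the CLI: the ResourceManager tracks every NodeManager and every running application.

yarn node -list
yarn application -list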

Example
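
A minimal end-to-end smoke test, assuming the cluster started above is running: submit the bundled pi estimator, which exercises HDFS, MapReduce, and YARN together. Progress can also be watched in the web UI of the active ResourceManager (default port 8088).

[root@worker3]# yarn jar /opt/hadoop/hadoop-2.7.4/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.4.jar pi 5 100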
