Hadoop study notes

zookeeper

xxx

Environment setup

  1. Download the ZooKeeper installation package
  2. Configure ZooKeeper
    1. Copy the template file zoo_sample.cfg in the same directory and rename the copy to zoo.cfg
    2. Edit zoo.cfg
      # Modify
      dataDir=/export/servers/zookeeper-3.4.9/zkdatas # directory where this node's data is stored
      
      # Enable (uncomment)
      autopurge.snapRetainCount=3
      autopurge.purgeInterval=1
      
      # Add
      server.1=node01:2888:3888
      server.2=node02:2888:3888
      server.3=node03:2888:3888
      
    3. Create the zkdatas directory under the ZooKeeper installation directory
    4. Create a myid file inside zkdatas containing this node's id, matching its server.N entry in zoo.cfg (when votes are otherwise equal, the node with the larger id is more likely to be elected leader)
      myid
      1
      
    5. Start ZooKeeper
      Run the following from the ZooKeeper installation directory
      # bin/zkServer.sh start
      # jps  check that the QuorumPeerMain process is running
      # bin/zkServer.sh status  show whether this node is the leader or a follower
      # bin/zkCli.sh  open the ZooKeeper command-line client
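
A minimal bring-up sketch for the whole three-node ensemble, assuming ZooKeeper is unpacked at the same path on node01-node03 and passwordless ssh is available (node names and the install path follow the notes above):

ZK_HOME=/export/servers/zookeeper-3.4.9
id=1
for host in node01 node02 node03; do
  # each node gets its own id in zkdatas/myid, then its server is started
  ssh "$host" "mkdir -p $ZK_HOME/zkdatas && echo $id > $ZK_HOME/zkdatas/myid"
  ssh "$host" "$ZK_HOME/bin/zkServer.sh start"
  id=$((id + 1))
done

# once all three are up, ask each node whether it is the leader or a follower
for host in node01 node02 node03; do
  ssh "$host" "$ZK_HOME/bin/zkServer.sh status"
done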
      

hadoop

Version: 2.7.5
Java: 1.8.0_301 (64-bit)

Environment configuration

1. Edit the configuration files (a quick sanity check of the result follows this list)

  1. hadoop/etc/hadoop/core-site.xml
    <configuration>
      <!-- Default filesystem: HDFS on the NameNode (fs.default.name is the deprecated alias of fs.defaultFS and still works in 2.7.5) -->
      <property>
      	<name>fs.default.name</name>
      	<value>hdfs://192.168.52.100:8020</value>
      </property>
      <!-- Temporary data directory -->
      <property>
      	<name>hadoop.tmp.dir</name>
      	<value>/export/servers/hadoop-2.7.5/hadoopDatas/tempDatas</value>
      </property>
      <!-- I/O buffer size (bytes) -->
      <property>
      	<name>io.file.buffer.size</name>
      	<value>4096</value>
      </property>
      <!-- Enable the trash; deleted files are kept for this many minutes -->
      <property>
      	<name>fs.trash.interval</name>
      	<value>10080</value>
      </property>
    </configuration>
    
  2. hadoop/etc/hadoop/hdfs-site.xml
    <configuration>
      <!-- SecondaryNameNode HTTP address -->
      <property>
      	<name>dfs.namenode.secondary.http-address</name>
      	<value>node01:50090</value>
      </property>

      <!-- NameNode HTTP (web UI) address -->
      <property>
      	<name>dfs.namenode.http-address</name>
      	<value>node01:50070</value>
      </property>
      <!-- Where the NameNode stores its metadata (fsimage) -->
      <property>
      	<name>dfs.namenode.name.dir</name>
      	<value>file:///export/servers/hadoop-2.7.5/hadoopDatas/namenodeDatas,file:///export/servers/hadoop-2.7.5/hadoopDatas/namenodeDatas2</value>
      </property>
      <!-- Where DataNodes store their block data -->
      <property>
      	<name>dfs.datanode.data.dir</name>
      	<value>file:///export/servers/hadoop-2.7.5/hadoopDatas/datanodeDatas,file:///export/servers/hadoop-2.7.5/hadoopDatas/datanodeDatas2</value>
      </property>
      <!-- Where the NameNode stores its edit log -->
      <property>
      	<name>dfs.namenode.edits.dir</name>
      	<value>file:///export/servers/hadoop-2.7.5/hadoopDatas/nn/edits</value>
      </property>
      <!-- SecondaryNameNode checkpoint image directory -->
      <property>
      	<name>dfs.namenode.checkpoint.dir</name>
      	<value>file:///export/servers/hadoop-2.7.5/hadoopDatas/snn/name</value>
      </property>
      <!-- SecondaryNameNode checkpoint edits directory -->
      <property>
      	<name>dfs.namenode.checkpoint.edits.dir</name>
      	<value>file:///export/servers/hadoop-2.7.5/hadoopDatas/dfs/snn/edits</value>
      </property>
      <!-- Number of replicas per block -->
      <property>
      	<name>dfs.replication</name>
      	<value>3</value>
      </property>
      <!-- Disable HDFS permission checking -->
      <property>
      	<name>dfs.permissions</name>
      	<value>false</value>
      </property>
      <!-- Block size: 128 MB -->
      <property>
      	<name>dfs.blocksize</name>
      	<value>134217728</value>
      </property>
    </configuration>
    
  3. hadoop/etc/hadoop/httpfs-env.sh
    export JAVA_HOME=/export/servers/jdk1.8.0_301
    
  4. hadoop-2.7.5/etc/hadoop/mapred-site.xml
    <configuration>
      <!-- Enable uber mode (run small jobs inside the ApplicationMaster JVM) -->
      <property>
      	<name>mapreduce.job.ubertask.enable</name>
      	<value>true</value>
      </property>
      <!-- JobHistory server host and port -->
      <property>
      	<name>mapreduce.jobhistory.address</name>
      	<value>node01:10020</value>
      </property>
      <!-- JobHistory web UI host and port -->
      <property>
      	<name>mapreduce.jobhistory.webapp.address</name>
      	<value>node01:19888</value>
      </property>
    </configuration>
    
  5. hadoop/etc/hadoop/yarn-site.xml
    <configuration>
      <!-- ResourceManager host -->
      <property>
      	<name>yarn.resourcemanager.hostname</name>
      	<value>node01</value>
      </property>
      <!-- Auxiliary service required for the MapReduce shuffle -->
      <property>
      	<name>yarn.nodemanager.aux-services</name>
      	<value>mapreduce_shuffle</value>
      </property>

      <!-- Aggregate container logs and keep them for 7 days -->
      <property>
      	<name>yarn.log-aggregation-enable</name>
      	<value>true</value>
      </property>
      <property>
      	<name>yarn.log-aggregation.retain-seconds</name>
      	<value>604800</value>
      </property>
      <!-- Memory available to containers on each NodeManager (MB) -->
      <property>
      	<name>yarn.nodemanager.resource.memory-mb</name>
      	<value>20480</value>
      </property>
      <!-- Minimum container allocation (MB) -->
      <property>
      	<name>yarn.scheduler.minimum-allocation-mb</name>
      	<value>2048</value>
      </property>
      <!-- Allowed ratio of virtual to physical memory per container -->
      <property>
      	<name>yarn.nodemanager.vmem-pmem-ratio</name>
      	<value>2.1</value>
      </property>
    </configuration>
    
  6. hadoop/etc/hadoop/mapred-env.sh
    export JAVA_HOME=/export/servers/jdk1.8.0_301/
    
  7. hadoop/etc/hadoop/slaves (one worker host per line; these hosts run the DataNodes and NodeManagers)
    node01
    node02
    node03
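
With all seven files edited, a quick local sanity check is possible before distributing anything (a sketch; hdfs getconf only reads the configuration files, so no daemon needs to be running, but JAVA_HOME must be visible to the shell):

cd /export/servers/hadoop-2.7.5
bin/hdfs getconf -confKey dfs.replication    # expect 3
bin/hdfs getconf -confKey dfs.blocksize      # expect 134217728
bin/hdfs getconf -namenodes                  # expect the NameNode host/IP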
    

2. Create the required directories

mkdir -p /export/servers/hadoop-2.7.5/hadoopDatas/tempDatas
mkdir -p /export/servers/hadoop-2.7.5/hadoopDatas/namenodeDatas
mkdir -p /export/servers/hadoop-2.7.5/hadoopDatas/namenodeDatas2
mkdir -p /export/servers/hadoop-2.7.5/hadoopDatas/datanodeDatas
mkdir -p /export/servers/hadoop-2.7.5/hadoopDatas/datanodeDatas2
mkdir -p /export/servers/hadoop-2.7.5/hadoopDatas/nn/edits
mkdir -p /export/servers/hadoop-2.7.5/hadoopDatas/snn/name
mkdir -p /export/servers/hadoop-2.7.5/hadoopDatas/dfs/snn/edits

3. Copy the hadoop directory to node02 and node03

Run the following from /export/servers/:
# scp -r hadoop-2.7.5 node02:$PWD
# scp -r hadoop-2.7.5 node03:$PWD

4. Configure the Hadoop environment variables

# add
export HADOOP_HOME=/export/servers/hadoop-2.7.5
export PATH=:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$PATH
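
A sketch of applying and checking the variables, assuming they were appended to /etc/profile (the notes do not name the file) and that the same is done on node02 and node03:

source /etc/profile
hadoop version    # should report Hadoop 2.7.5
which hdfs        # should resolve to /export/servers/hadoop-2.7.5/bin/hdfs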

5. Start the cluster

...
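
A typical first start for a Hadoop 2.7.5 cluster configured as above (a sketch; run on node01, and format the NameNode only once, before the very first start):

cd /export/servers/hadoop-2.7.5
bin/hdfs namenode -format                           # one-time only: initializes the NameNode metadata directories
sbin/start-dfs.sh                                   # NameNode, DataNodes, SecondaryNameNode
sbin/start-yarn.sh                                  # ResourceManager and NodeManagers
sbin/mr-jobhistory-daemon.sh start historyserver    # JobHistory server on node01
jps                                                 # confirm the daemons are running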

hdfs

1. Replication

2. Rack awareness

A rack is the cabinet that houses the servers. With the default placement policy and three replicas, HDFS writes the first replica on the client's own DataNode (or a random node), the second on a node in a different rack, and the third on another node in the same rack as the second.
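
Once the cluster is running, the rack each DataNode has been assigned to can be listed (with no rack script configured, every node falls under /default-rack):

# hdfs dfsadmin -printTopology  list the racks and the DataNodes registered under each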

HDFS command line

# hdfs dfs -ls
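
A few more everyday hdfs dfs operations (a sketch; /test and the file names are only examples):

hdfs dfs -mkdir -p /test                # create a directory in HDFS
hdfs dfs -put localfile.txt /test/      # upload a local file
hdfs dfs -ls /test                      # list the directory
hdfs dfs -cat /test/localfile.txt       # print the file contents
hdfs dfs -get /test/localfile.txt .     # download it back to the local filesystem
hdfs dfs -rm -r /test                   # delete (moved to the trash, see fs.trash.interval)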

3. Writing files

4. fsimage and edits

The NameNode keeps its metadata in memory; fsimage and edits are the files that persist that metadata to disk.

fsimage holds a snapshot of the NameNode metadata (it is not updated synchronously with every change).
edits holds the log of recent metadata operations.

When the NameNode starts, it loads fsimage into memory and then replays edits, recreating whatever metadata had not yet been written into fsimage.
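
Both files live under the metadata directories configured in hdfs-site.xml (dfs.namenode.name.dir/current for fsimage, dfs.namenode.edits.dir/current for edits) and can be dumped to readable XML with the offline viewers; the file names below are placeholders, since the real ones contain transaction ids:

hdfs oiv -p XML -i /export/servers/hadoop-2.7.5/hadoopDatas/namenodeDatas/current/fsimage_0000000000000000123 -o /tmp/fsimage.xml    # offline image viewer
hdfs oev -p xml -i /export/servers/hadoop-2.7.5/hadoopDatas/nn/edits/current/edits_0000000000000000001-0000000000000000123 -o /tmp/edits.xml    # offline edits viewer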

5. secondaryNameNode

A checkpoint runs after a configurable time interval or once enough edit transactions have accumulated (in Hadoop 2.x these thresholds are dfs.namenode.checkpoint.period and dfs.namenode.checkpoint.txns in hdfs-site.xml).

graph TD
  subgraph secondaryNameNode
    A[fsimage] -->|merge| C
    B[edits] -->|merge| C
    C[new fsimage]
  end

The new fsimage then replaces the old one on the NameNode, and edits is reset so that it only records operations made after the checkpoint.
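
After a checkpoint runs, the merged image shows up in the checkpoint directory configured above (a sketch; the exact file names contain transaction ids):

# ls /export/servers/hadoop-2.7.5/hadoopDatas/snn/name/current  fsimage_* files written by the SecondaryNameNode
# ls /export/servers/hadoop-2.7.5/hadoopDatas/namenodeDatas/current  the NameNode's own fsimage_* and edits_* files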
