hadoop cluster construction

1, Hadoop cluster construction

1. Installing virtual machines

1. Installing vmtools
hadoop@ubuntu:sudo apt-get install open-vm-tools-desktop -y
2. Install vim editor
hadoop@ubuntu:sudo apt install vim

2. Install jdk

1. Unzip the installation package
hadoop@ubuntu:~$ sudo tar -zxvf jdk-8u121-linux-x64.tar.gz -C /usr/local
2. Modify environment variables
hadoop@ubuntu:~$ sudo vim ~/.bashrc
#JAVA
export JAVA_HOME=/usr/local/jdk1.8.0_121
PATH=$PATH:$JAVA_HOME/bin
3. Environment variable validation
hadoop@ubuntu:~$ source ~/.bashrc
4. Check whether the jdk is installed successfully
hadoop@ubuntu:~$ java -version
java version "1.8.0_121"
Java(TM) SE Runtime Environment (build 1.8.0_121-b13)
Java HotSpot(TM) 64-Bit Server VM (build 25.121-b13, mixed mode)

3. Installing hadoop

1. Unzip the installation package
hadoop@ubuntu:~$ sudo tar -zxvf hadoop-2.7.7.tar.gz -C /usr/local
2. Modify environment variables
hadoop@ubuntu:~$ sudo vim ~/.bashrc
#Hadoop
export HADOOP_HOME=/usr/local/hadoop-2.7.7
PATH=$PATH:$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
3. Environment variable validation
hadoop@ubuntu:~$ source ~/.bashrc

4. Clone two child nodes

jdk and hadoop do not require installation configuration

5. Configure the hostname, static IP address, and hostname-to-IP mapping

1. Log in as the root user and change the host names of the three virtual machines to master, slave1 and slave2

1. / / the host name of the first virtual machine is changed to master

root@ubuntu:~# vi /etc/hostname
master

2. / / the host name of the second virtual machine is changed to slave 1

root@ubuntu:~# vi /etc/hostname
slave1
root@ubuntu:~# reboot

3. / / the host name of the third virtual machine is changed to slave2

root@ubuntu:~# vi /etc/hostname
slave2
root@ubuntu:~# reboot
2. Log in as root and set the static ip address (all three nodes need to be configured)

1. // Taking master as an example, modify the network configuration file as follows:

root@master:~# vim /etc/netplan/01-network-manager-all.yaml
# Let NetworkManager manage all devices on this system
#network:
# version: 2
# renderer: NetworkManager
network:
    ethernets:
        ens33:                  # Configured network card name
            dhcp4: no           # Close dhcp4
            dhcp6: no           # Close dhcp6
            addresses: [192.168.126.143/24]       # Set local IP address and mask
            gateway4: 192.168.126.2               # Set gateway
            nameservers:
                    addresses: [192.168.126.2, 114.114.114.114, 8.8.8.8]       # Set DNS
    version: 2

Note: the gateway must be the same as that in vmnet8 of the host computer

2. Configuration effective

root@master:~# netplan apply
3. Configure the mapping relationship between ip and host name

//Log in as root and configure the mapping relationship between ip and host name on the three virtual machines

root@master:~# vi /etc/hosts

127.0.0.1 localhost

192.168.126.143 master
192.168.126.146 slave1
192.168.126.147 slave2
root@slave1:~# vi /etc/hosts

127.0.0.1 localhost

192.168.126.143 master
192.168.126.146 slave1
192.168.126.147 slave2
root@slave2:~# vi /etc/hosts

127.0.0.1 localhost

192.168.126.143 master
192.168.126.146 slave1
192.168.126.147 slave2

6. Configure passwordless SSH

1. Log in as root and open port 22 on each linux host
hadoop@master:~$ su - root
Password: 
root@master:~# vim /etc/ssh/sshd_config
port 22
2. Each linux host installs openssh server and generates a key pair
//The hadoop user logs in to the master
root@master:~# su - hadoop
hadoop@master:~$ sudo apt install openssh-server
hadoop@master:~$ ssh-keygen -t rsa
hadoop@master:~$ ls ~/.ssh
id_rsa  id_rsa.pub
// hadoop user login slave1
root@slave1:~# su - hadoop
hadoop@slave1:~$ sudo apt install openssh-server
hadoop@slave1:~$ ssh-keygen -t rsa
hadoop@slave1:~$ ls ~/.ssh
id_rsa  id_rsa.pub
//hadoop user login slave2
root@slave2:~# su - hadoop
hadoop@slave2:~$ sudo apt install openssh-server
hadoop@slave2:~$ ssh-keygen -t rsa
hadoop@slave2:~$ ls ~/.ssh
id_rsa  id_rsa.pub
3. Set up passwordless login between the nodes
//The hadoop user logs in to the master node and copies its public key into the file authorized_keys
hadoop@master:~/.ssh$ cp id_rsa.pub authorized_keys
//Copy the master node's authorized_keys to the slave1 node
hadoop@master:~/.ssh$ scp /home/hadoop/.ssh/authorized_keys  hadoop@slave1:/home/hadoop/.ssh/authorized_keys
authorized_keys

// The hadoop user logs in to slave1 and appends the slave1 node's public key to authorized_keys
hadoop@slave1:~/.ssh$ cat id_rsa.pub >> authorized_keys
//Copy the slave1 node's authorized_keys to the slave2 node
hadoop@slave1:~/.ssh$ scp /home/hadoop/.ssh/authorized_keys  hadoop@slave2:/home/hadoop/.ssh/authorized_keys

// The hadoop user logs in to slave2 and appends the slave2 node's public key to authorized_keys
hadoop@slave2:~/.ssh$ cat id_rsa.pub >> authorized_keys
//Copy the completed authorized_keys from the slave2 node back to slave1
hadoop@slave2:~/.ssh$ scp /home/hadoop/.ssh/authorized_keys    hadoop@slave1:/home/hadoop/.ssh/authorized_keys
hadoop@slave1's password: 
authorized_keys                                                                                                                  100% 1196   804.0KB/s   00:00    
//Copy the completed authorized_keys from the slave2 node back to the master
hadoop@slave2:~/.ssh$ scp /home/hadoop/.ssh/authorized_keys    hadoop@master:/home/hadoop/.ssh/authorized_keys
hadoop@master's password: 
authorized_keys     
4. Verify passwordless login

Log in to each node using ssh to check that no password is required

7. Cluster node configuration

1. xml file for configuring master

slaves,core-site.xml,hdfs-site.xml,mapred-site.xml,yarn-site.xml

1,hadoop@master:~$ vim /usr/local/hadoop-2.7.7/etc/hadoop/slaves

slave1
slave2

2,hadoop@master:~$ vim /usr/local/hadoop-2.7.7/etc/hadoop/core-site.xml

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://master:9000</value>
    </property>
    <property>
         <!-- The hadoop user must create the tmp directory and grant it read/write/execute permission -->
        <name>hadoop.tmp.dir</name>
		<value>/usr/local/hadoop-2.7.7/data/</value>
        <description>A base for other temporary directories.</description>
    </property>
</configuration>

3,hadoop@master:~$ vim /usr/local/hadoop-2.7.7/etc/hadoop/hdfs-site.xml

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
    <property>
        <name>dfs.replication</name>
        <value>1</value>
    </property>
	<!-- Specify the host for the Hadoop secondary namenode -->
	<property>
		<name>dfs.namenode.secondary.http-address</name>
		<value>master:50090</value>
	</property>
    <property>
        <name>dfs.datanode.directoryscan.throttle.limit.ms.per.sec</name>
        <value>1000</value>
    </property>
</configuration>

4,hadoop@master:~$ vim /usr/local/hadoop-2.7.7/etc/hadoop/mapred-site.xml

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
    <property>
        <name>mapreduce.jobhistory.address</name>
        <value>master:10020</value>
    </property>
    <property>
        <name>mapreduce.jobhistory.webapp.address</name>
        <value>master:19888</value>
    </property>
</configuration>

5,hadoop@master:~$ vim /usr/local/hadoop-2.7.7/etc/hadoop/yarn-site.xml

<?xml version="1.0"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->
<configuration>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <!-- Specify the address of the YARN ResourceManager -->
	<property>
        <name>yarn.resourcemanager.hostname</name>
        <value>master</value>
    </property>
</configuration>
2. Check that the JAVA_HOME environment variable has been configured in yarn-env.sh, mapred-env.sh, and hadoop-env.sh
# The java implementation to use.
export JAVA_HOME=/usr/local/jdk1.8.0_121

8. Synchronize the configuration of each node

1. Create a temporary file in slave1 and modify the permissions
hadoop@slave1:~$ cd /usr/local
hadoop@slave1:/usr/local$ sudo mkdir tmp
hadoop@slave1:/usr/local$ sudo chown -R hadoop tmp
hadoop@slave1:/usr/local$ chgrp hadoop tmp
hadoop@slave1:/usr/local$ chmod -R 777 tmp
2. scp transfer file
hadoop@master:/usr/local$ scp -r /usr/local/jdk1.8.0_121/ hadoop@slave1:/usr/local/tmp
hadoop@master:/usr/local$ scp -r /usr/local/hadoop-2.7.7/ hadoop@slave1:/usr/local/tmp
3. Move temporary files to / usr/local
hadoop@slave1:/usr/local$ su - root
Password: 
root@slave1:~# mv -f /usr/local/tmp/jdk1.8.0_121/ /usr/local/jdk1.8.0_121/
root@slave1:~# mv -f /usr/local/tmp/hadoop-2.7.7/ /usr/local/hadoop-2.7.7/
4. Operations (1-3) are performed again on slave2

9. Start cluster

1. The first time you start the cluster, you need to format it. Do not format it again
2. If you need to reformat, you need 3 steps
hadoop@slave2:/usr/local/hadoop-2.7.7$ stop-all.sh
hadoop@slave2:/usr/local/hadoop-2.7.7/data$ rm -rf *
hadoop@master:/usr/local/hadoop-2.7.7/logs$ rm -rf *
hadoop@master:/usr/local/hadoop-2.7.7/logs$ hdfs namenode -format
21/08/19 19:47:22 INFO util.ExitUtil: Exiting with status 0
3,start-all.sh start all processes
hadoop@master:/usr/local/hadoop-2.7.7/logs$ start-all.sh
4. Check whether each node is normal
//Check whether the master nodes are normal
hadoop@master:/usr/local/hadoop-2.7.7/logs$ jps
7286 Jps
6631 NameNode
6874 SecondaryNameNode
7036 ResourceManager
//Check whether all slave1 nodes are normal
hadoop@master:/usr/local/hadoop-2.7.7/logs$ ssh slave1
hadoop@slave1:~$ jps
2791 DataNode
3213 Jps
3039 NodeManager
hadoop@slave1:~$ exit
//Check whether the slave2 nodes are normal
hadoop@master:/usr/local/hadoop-2.7.7/logs$ ssh slave2
hadoop@slave2:~$ jps
2801 NodeManager
2977 Jps
2553 DataNode
hadoop@slave2:~$ exit

10. Test HDFS and YARN

1. Access in virtual machine

http://localhost:50070

http://master:8088

2, Zookeeper cluster construction

1. Unzip the installation file

hadoop@master:~$ sudo rm -rf /usr/local/zookeeper/
hadoop@master:~$sudo mkdir /usr/local/zookeeper/
hadoop@master:~$sudo chown -R hadoop:hadoop /usr/local/zookeeper/
hadoop@master:~$sudo tar -zxvf ~/Downloads/apache-zookeeper-3.5.9-bin.tar.gz -C /usr/local/zookeeper/
hadoop@master:~$cd /usr/local/zookeeper
hadoop@master:/usr/local/zookeeper$sudo chown -R hadoop:hadoop apache-zookeeper-3.5.9-bin/
hadoop@master:/usr/local/zookeeper$sudo mv apache-zookeeper-3.5.9-bin/ zookeeper
hadoop@master:/usr/local/zookeeper$ cd zookeeper/
hadoop@master:/usr/local/zookeeper/zookeeper$ ll
total 48
drwxr-xr-x 6 hadoop hadoop  4096 Nov 22 18:33 ./
drwxr-xr-x 3 hadoop hadoop  4096 Nov 22 18:34 ../
drwxr-xr-x 2 hadoop hadoop  4096 Jan  6  2021 bin/
drwxr-xr-x 2 hadoop hadoop  4096 Jan  6  2021 conf/
drwxr-xr-x 5 hadoop hadoop  4096 Jan  6  2021 docs/
drwxr-xr-x 2 hadoop hadoop  4096 Nov 22 18:34 lib/
-rw-r--r-- 1 hadoop hadoop 11358 Oct  5  2020 LICENSE.txt
-rw-r--r-- 1 hadoop hadoop   432 Jan  6  2021 NOTICE.txt
-rw-r--r-- 1 hadoop hadoop  1560 Jan  6  2021 README.md
-rw-r--r-- 1 hadoop hadoop  1347 Jan  6  2021 README_packaging.txt

2. Configure environment variables

hadoop@master:/usr/local$sudo vim ~/.bashrc
#Zookeeper
export ZK_HOME=/usr/local/zookeeper/zookeeper
export PATH=$PATH:$ZK_HOME/bin
hadoop@master:/usr/local$source ~/.bashrc

3. Create data and datalog folders under the specified directory

hadoop@master:/usr/local/zookeeper/zookeeper$mkdir data
hadoop@master:/usr/local/zookeeper/zookeeper$mkdir datalog

4. Configure the zoo.cfg file

hadoop@master:/usr/local/zookeeper/zookeeper$cd conf
hadoop@master:/usr/local/zookeeper/zookeeper/conf$cp zoo_sample.cfg zoo.cfg
hadoop@master:/usr/local/zookeeper/zookeeper/conf$vim zoo.cfg
# The number of milliseconds of each tick
tickTime=2000
# The number of ticks that the initial
# synchronization phase can take
initLimit=10
# The number of ticks that can pass between
# sending a request and getting an acknowledgement
syncLimit=5
# The directory where the snapshot is stored.
# Do not use /tmp for storage; /tmp here is just
# an example.
dataDir=/usr/local/zookeeper/zookeeper/data
dataLogDir=/usr/local/zookeeper/zookeeper/datalog
# The port at which the clients will connect
clientPort=2181
# The maximum number of client connections.
# Increase this if you need to handle more clients
# server.1-3: use the actual IP addresses of your cluster nodes (master, slave1, slave2)
server.1=192.168.126.143:2888:3888
server.2=192.168.126.146:2889:3889
server.3=192.168.126.147:2890:3890
maxClientCnxns=60
#
# Be sure to read the maintenance section of the
# administrator guide before turning on autopurge.
#
# http://zookeeper.apache.org/doc/current/zookeeperAdmin.html#sc_maintenance
#
# The number of snapshots to retain in dataDir
#autopurge.snapRetainCount=3
# Purge task interval in hours
# Set to "0" to disable the autopurge feature
#autopurge.purgeInterval=1

5. Configure synchronization to other nodes

1. scp transfer file
hadoop@master:/usr/local$ scp -r /usr/local/zookeeper/ hadoop@slave1:/usr/local/tmp
2. Move the files in the temporary directory to / usr/local
hadoop@slave1:~$ sudo mv -f /usr/local/tmp/zookeeper/ /usr/local/zookeeper/
3. Configure environment variables
root@slave1:~#su - hadoop
hadoop@slave1:~$ sudo vim ~/.bashrc
#Zookeeper
export ZK_HOME=/usr/local/zookeeper/zookeeper
export PATH=$PATH:$ZK_HOME/bin
4. Environment variable validation
hadoop@slave1:~$ source ~/.bashrc
5. Perform the operation (1-4) on slave2 again

6. Create myid file

hadoop@master:~$vim /usr/local/zookeeper/zookeeper/data/myid
1
hadoop@slave1:~$vim /usr/local/zookeeper/zookeeper/data/myid
2
hadoop@slave2:~$vim /usr/local/zookeeper/zookeeper/data/myid
3

7. Start zookeeper

start-up master of zookeeper
hadoop@master:~$ zkServer.sh start
    
start-up slave1 of zookeeper
hadoop@slave1:~$ zkServer.sh start
    
start-up slave2 of zookeeper
hadoop@slave2:~$ zkServer.sh start

8. Test connection zookeeper

hadoop@slave2:/usr/local/zookeeper/zookeeper/bin$zkCli.sh

Keywords: Big Data Hadoop Ubuntu

Added by nano on Wed, 05 Jan 2022 03:46:35 +0200