steps
- install ubuntu 15.04 desktop on 5 nodes
- setup ssh
- setup hadoop
- config and start hadoop
- setup zookeeper
- setup hbase
- setup sqoop
- setup spark
- fixed
1. setup ubuntu
-
download and install ubuntu desktop on 5 nodes
-
setup static ip address for 5 nodes
-
node5
$ sudo pico /etc/network/interfaces

# interfaces(5) file used by ifup(8) and ifdown(8)
auto lo
iface lo inet loopback

auto eth0
iface eth0 inet static
    address 192.168.120.155
    netmask 255.255.255.0
    network 192.168.120.0
    broadcast 192.168.120.255
    gateway 192.168.120.1
    dns-nameservers 192.168.10.220 192.168.10.221

$ sudo pico /etc/resolv.conf

# Dynamic resolv.conf(5) file for glibc resolver(3) generated by resolvconf(8)
#     DO NOT EDIT THIS FILE BY HAND -- YOUR CHANGES WILL BE OVERWRITTEN
nameserver 192.168.10.220
nameserver 192.168.10.221

$ sudo pico /etc/hosts

127.0.0.1       localhost
127.0.1.1       node5

# The following lines are desirable for IPv6 capable hosts
::1     ip6-localhost ip6-loopback
fe00::0 ip6-localnet
ff00::0 ip6-mcastprefix
ff02::1 ip6-allnodes
ff02::2 ip6-allrouters

192.168.120.155 node5
192.168.120.154 node4
192.168.120.153 node3
192.168.120.152 node2
192.168.120.151 node1

$ ping www.baidu.com
-
node1 node2 node3 node4
# same as node5
-
mac os x yosemite
$ pico /private/etc/hosts

##
# Host Database
#
# localhost is used to configure the loopback interface
# when the system is booting.  Do not change this entry.
##
127.0.0.1       localhost
255.255.255.255 broadcasthost
::1             localhost

192.168.120.155 node5
192.168.120.154 node4
192.168.120.153 node3
192.168.120.152 node2
192.168.120.151 node1
-
change apt sources
$ sudo sed 's@cn.archive.ubuntu.com@mirrors.163.com@' -i /etc/apt/sources.list
$ sudo sed 's@security.ubuntu.com@mirrors.163.com@' -i /etc/apt/sources.list
-
set HADOOP_HOME JAVA_HOME
$ pico ~/.bashrc
...
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre
export HADOOP_HOME=~/node5/hadoop
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
export PATH=$PATH:$JAVA_HOME/bin
...
$ source ~/.bashrc
-
firewall
-
check status of firewall
$ sudo ufw status
-
disable firewall
$ sudo ufw disable
-
reload / restart firewall
$ sudo ufw reload
-
get ipv4 iptables status
$ sudo iptables -L -n -v
-
get ipv6 ip6tables status
$ sudo ip6tables -L -n -v
-
add user
$ sudo addgroup hadoop
$ sudo adduser --ingroup hadoop hduser
$ sudo chown -R hduser:hadoop /opt/bigdata
-
for sublime-text-2:
$ sudo add-apt-repository ppa:webupd8team/sublime-text-2
$ sudo apt-get update
$ sudo apt-get install sublime-text
-
for sublime-text-3:
$ sudo add-apt-repository ppa:webupd8team/sublime-text-3
$ sudo apt-get update
$ sudo apt-get install sublime-text-installer
-
install java on five nodes
$ sudo apt-get install openjdk-8-jdk   # matches JAVA_HOME (java-8-openjdk-amd64) used throughout
2. setup ssh
-
install openssh-server
-
node1 to node5
$ sudo apt-get install openssh-server
-
generate ssh key
-
node5
$ su - hduser
$ ssh-keygen -t rsa -P ""
$ cd ~/.ssh
$ cat id_rsa.pub >> authorized_keys
-
setup passphraseless ssh
$ ssh-copy-id -i id_rsa.pub hduser@node1
$ ssh-copy-id -i id_rsa.pub hduser@node2
$ ssh-copy-id -i id_rsa.pub hduser@node3
$ ssh-copy-id -i id_rsa.pub hduser@node4
-
check ssh login
$ ssh hduser@node1
...
$ exit
$ ssh hduser@node2
...
$ exit
$ ssh hduser@node3
...
$ exit
$ ssh hduser@node4
...
$ exit
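The four logins above can also be smoke-tested in one pass (a minimal sketch; assumes the hostnames and hduser account from this guide):
$ for h in node1 node2 node3 node4; do ssh hduser@$h hostname; done
# should print node1..node4 with no password prompts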
-
$ rm -f ~/.ssh/known_hosts
or
$ ssh-keygen -R node1   # -R takes a hostname, removing that host's cached key
3. setup hadoop
-
topology
                      node5
                 +--------------+
                 |              |
                 |   NameNode   |
                 |              |
                 +--------------+
                  /      |      \
                 /       |       \
                /        |        \
     node2             node3             node4
+--------------+  +--------------+  +--------------+
|              |  |              |  |              |
|   DataNode   |  |   DataNode   |  |   DataNode   |
|              |  |              |  |              |
+--------------+  +--------------+  +--------------+
-
setup
-
copy software to node5
$ scp hadoop-2.7.0.tar.gz hduser@node5:~/
-
mkdir to install hadoop
$ ssh hduser@node5
$ sudo mkdir /opt/bigdata
$ sudo chown -R hduser:hadoop /opt/bigdata
$ exit
...
# node4 node3 node2
...
$ ssh hduser@node1
$ sudo mkdir /opt/bigdata
$ sudo chown -R hduser:hadoop /opt/bigdata
$ exit
-
unzip, copy configs, and set .bashrc
$ tar -zxvf hadoop-2.7.0.tar.gz
$ mv hadoop-2.7.0 /opt/bigdata/hadoop
$ scp core-site.xml hdfs-site.xml mapred-site.xml slaves yarn-site.xml \
    hduser@node5:/opt/bigdata/hadoop/etc/hadoop
$ pico .bashrc
...
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre
export HADOOP_PREFIX="/opt/bigdata/hadoop"
export HADOOP_HOME=$HADOOP_PREFIX
export HADOOP_COMMON_HOME=$HADOOP_PREFIX
export HADOOP_CONF_DIR=$HADOOP_PREFIX/etc/hadoop
export HADOOP_HDFS_HOME=$HADOOP_PREFIX
export HADOOP_MAPRED_HOME=$HADOOP_PREFIX
export HADOOP_YARN_HOME=$HADOOP_PREFIX
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
export PATH=$PATH:$JAVA_HOME/bin
...
-
sync folder
$ scp .bashrc hduser@node4:~/
$ scp .bashrc hduser@node3:~/
$ scp .bashrc hduser@node2:~/
$ scp .bashrc hduser@node1:~/
$ scp -r /opt/bigdata/hadoop hduser@node4:/opt/bigdata/hadoop
$ scp -r /opt/bigdata/hadoop hduser@node3:/opt/bigdata/hadoop
$ scp -r /opt/bigdata/hadoop hduser@node2:/opt/bigdata/hadoop
$ scp -r /opt/bigdata/hadoop hduser@node1:/opt/bigdata/hadoop
$ scp -r /opt/bigdata/hadoop/etc/hadoop hduser@node4:/opt/bigdata/hadoop/etc
$ scp -r /opt/bigdata/hadoop/etc/hadoop hduser@node3:/opt/bigdata/hadoop/etc
$ scp -r /opt/bigdata/hadoop/etc/hadoop hduser@node2:/opt/bigdata/hadoop/etc
$ scp -r /opt/bigdata/hadoop/etc/hadoop hduser@node1:/opt/bigdata/hadoop/etc
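The repeated scp lines can be collapsed into a loop (a sketch under the same paths, run from node5; assumes /opt/bigdata already exists on every node):
$ for h in node1 node2 node3 node4; do
>   scp .bashrc hduser@$h:~/
>   scp -r /opt/bigdata/hadoop hduser@$h:/opt/bigdata
> done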
-
4. config and start hadoop
-
config
-
core-site.xml
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <!-- fs.default.name is deprecated in hadoop 2.x; fs.defaultFS is the current name -->
  <property>
    <name>fs.default.name</name>
    <value>hdfs://node5:9000</value>
  </property>
  <property>
    <name>hadoop.tmp.dir</name>
    <value>file:///app/hadoop/tmp</value>
  </property>
</configuration>
-
hdfs-site.xml
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <property>
    <name>dfs.namenode.checkpoint.period</name>
    <value>3600</value>
  </property>
  <property>
    <name>dfs.namenode.name.dir</name>
    <value>file:///app/hadoop/hdfs/name</value>
    <final>true</final>
  </property>
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>file:///app/hadoop/hdfs/data</value>
    <final>true</final>
  </property>
  <property>
    <name>dfs.blocksize</name>
    <value>134217728</value>
  </property>
  <property>
    <name>dfs.replication</name>
    <value>3</value>
  </property>
  <property>
    <name>dfs.permissions</name>
    <value>false</value>
  </property>
  <property>
    <name>dfs.namenode.handler.count</name>
    <value>50</value>
  </property>
  <property>
    <name>dfs.namenode.checkpoint.dir</name>
    <value>file:///app/hadoop/hdfs/namesecondary</value>
  </property>
</configuration>
-
yarn-site.xml
<?xml version="1.0"?>
<configuration>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
    <value>org.apache.hadoop.mapred.ShuffleHandler</value>
  </property>
  <property>
    <name>yarn.resourcemanager.resource-tracker.address</name>
    <value>node5:8025</value>
  </property>
  <property>
    <name>yarn.resourcemanager.scheduler.address</name>
    <value>node5:8030</value>
  </property>
  <property>
    <name>yarn.resourcemanager.address</name>
    <value>node5:8040</value>
  </property>
</configuration>
-
mapred-site.xml
<?xml version="1.0"?>
<configuration>
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
</configuration>
-
slaves
node4
node3
node2
node1
-
format and start
-
on node5 format hdfs
$ bin/hdfs namenode -format
# if the namenode is still in safemode: bin/hdfs dfsadmin -safemode leave
$ bin/hdfs dfsadmin -report
Configured Capacity: 5854310649856 (5.32 TB)
Present Capacity: 5532915458048 (5.03 TB)
DFS Remaining: 5532915384320 (5.03 TB)
DFS Used: 73728 (72 KB)
DFS Used%: 0.00%
Under replicated blocks: 0
Blocks with corrupt replicas: 0
Missing blocks: 0
Missing blocks (with replication factor 1): 0

-------------------------------------------------
Live datanodes (3):

Name: 192.168.120.154:50010 (node4)
Hostname: node4
Decommission Status : Normal
Configured Capacity: 1951699709952 (1.78 TB)
DFS Used: 24576 (24 KB)
Non DFS Used: 103953088512 (96.81 GB)
DFS Remaining: 1847746596864 (1.68 TB)
DFS Used%: 0.00%
DFS Remaining%: 94.67%
Configured Cache Capacity: 0 (0 B)
Cache Used: 0 (0 B)
Cache Remaining: 0 (0 B)
Cache Used%: 100.00%
Cache Remaining%: 0.00%
Xceivers: 1
Last contact: Wed Jul 08 08:50:45 CST 2015

Name: 192.168.120.152:50010 (node2)
Hostname: node2
Decommission Status : Normal
Configured Capacity: 1951307567104 (1.77 TB)
DFS Used: 24576 (24 KB)
Non DFS Used: 104823349248 (97.62 GB)
DFS Remaining: 1846484193280 (1.68 TB)
DFS Used%: 0.00%
DFS Remaining%: 94.63%
Configured Cache Capacity: 0 (0 B)
Cache Used: 0 (0 B)
Cache Remaining: 0 (0 B)
Cache Used%: 100.00%
Cache Remaining%: 0.00%
Xceivers: 1
Last contact: Wed Jul 08 08:50:45 CST 2015

Name: 192.168.120.153:50010 (node3)
Hostname: node3
Decommission Status : Normal
Configured Capacity: 1951303372800 (1.77 TB)
DFS Used: 24576 (24 KB)
Non DFS Used: 112618754048 (104.88 GB)
DFS Remaining: 1838684594176 (1.67 TB)
DFS Used%: 0.00%
DFS Remaining%: 94.23%
Configured Cache Capacity: 0 (0 B)
Cache Used: 0 (0 B)
Cache Remaining: 0 (0 B)
Cache Used%: 100.00%
Cache Remaining%: 0.00%
Xceivers: 1
Last contact: Wed Jul 08 08:50:45 CST 2015
-
on node5 start
$ sbin/start-dfs.sh
$ sbin/start-yarn.sh
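To confirm the daemons actually came up, jps on each node should list the expected processes (a quick check; PIDs will differ, and the SecondaryNameNode location assumes the default config):
# on node5
$ jps
... NameNode
... SecondaryNameNode
... ResourceManager
# on node2 node3 node4
$ jps
... DataNode
... NodeManager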
-
on node5 stop
$ sbin/stop-yarn.sh
$ sbin/stop-dfs.sh
-
5. setup zookeeper
-
topology
    node2            node3            node4
+-----------+    +-----------+    +-----------+
|           |    |           |    |           |
| ZooKeeper |    | ZooKeeper |    | ZooKeeper |
|           |    |           |    |           |
+-----------+    +-----------+    +-----------+
-
setup
-
copy software to node4
$ scp zookeeper-3.4.6.tar.gz hduser@node4:/opt/bigdata
$ cd /opt/bigdata
$ tar -zxf zookeeper-3.4.6.tar.gz
$ mv zookeeper-3.4.6 zookeeper
-
config zookeeper
$ pico /opt/bigdata/zookeeper/conf/zoo.cfg

tickTime=2000
dataDir=/app/hadoop/zookeeper/data
clientPort=2181
initLimit=10
syncLimit=5
server.4=node4:20010:20011
server.3=node3:20010:20011
server.2=node2:20010:20011
-
copy zookeeper to node3, node2
$ scp -r /opt/bigdata/zookeeper hduser@node3:/opt/bigdata
$ scp -r /opt/bigdata/zookeeper hduser@node2:/opt/bigdata
-
mkdir for zookeeper data
# on node2
$ sudo mkdir -p /app/hadoop/zookeeper/data
$ sudo chown -R hduser:hadoop /app/hadoop/zookeeper
# on node3
$ sudo mkdir -p /app/hadoop/zookeeper/data
$ sudo chown -R hduser:hadoop /app/hadoop/zookeeper
# on node4
$ sudo mkdir -p /app/hadoop/zookeeper/data
$ sudo chown -R hduser:hadoop /app/hadoop/zookeeper
-
config node's myid
# on node2
$ pico /app/hadoop/zookeeper/data/myid
2
# on node3
$ pico /app/hadoop/zookeeper/data/myid
3
# on node4
$ pico /app/hadoop/zookeeper/data/myid
4
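The same files can be written non-interactively (a sketch; each id must match its server.N line in zoo.cfg):
# on node2 / node3 / node4 respectively
$ echo 2 > /app/hadoop/zookeeper/data/myid
$ echo 3 > /app/hadoop/zookeeper/data/myid
$ echo 4 > /app/hadoop/zookeeper/data/myid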
-
start / status / stop
-
start zookeeper
# on node2
$ /opt/bigdata/zookeeper/bin/zkServer.sh start
# on node3
$ /opt/bigdata/zookeeper/bin/zkServer.sh start
# on node4
$ /opt/bigdata/zookeeper/bin/zkServer.sh start
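Since passphraseless ssh is already in place, all three servers can also be started from one shell (a sketch; same install path assumed on every node):
$ for h in node2 node3 node4; do ssh hduser@$h /opt/bigdata/zookeeper/bin/zkServer.sh start; done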
-
check zookeeper status
# on node2
$ /opt/bigdata/zookeeper/bin/zkServer.sh status
JMX enabled by default
Using config: /opt/bigdata/zookeeper/bin/../conf/zoo.cfg
Mode: follower
# on node3
$ /opt/bigdata/zookeeper/bin/zkServer.sh status
JMX enabled by default
Using config: /opt/bigdata/zookeeper/bin/../conf/zoo.cfg
Mode: follower
# on node4
$ /opt/bigdata/zookeeper/bin/zkServer.sh status
JMX enabled by default
Using config: /opt/bigdata/zookeeper/bin/../conf/zoo.cfg
Mode: leader
-
# use `echo` and `nc`
$ echo ruok | nc node2 2181
imok
$ echo status | nc node2 2181
Zookeeper version: 3.4.6-1569965, built on 02/20/2014 09:09 GMT
Clients:
 /192.168.120.154:47757[1](queued=0,recved=25564,sent=25564)
 /192.168.120.153:44776[1](queued=0,recved=25565,sent=25565)
 /192.168.120.152:54094[1](queued=0,recved=25565,sent=25565)
 /192.168.120.152:38021[0](queued=0,recved=1,sent=0)
 /192.168.120.152:54089[1](queued=0,recved=25648,sent=25662)
 /192.168.120.155:35662[1](queued=0,recved=25565,sent=25565)
 /192.168.120.153:44772[1](queued=0,recved=25566,sent=25566)

Latency min/avg/max: 0/0/36
Received: 170608
Sent: 170657
Connections: 7
Outstanding: 0
Zxid: 0x100000117
Mode: follower
Node count: 42

$ echo status | nc node3 2181
Zookeeper version: 3.4.6-1569965, built on 02/20/2014 09:09 GMT
Clients:
 /192.168.120.154:39911[1](queued=0,recved=227,sent=227)
 /192.168.120.152:48327[0](queued=0,recved=1,sent=0)

Latency min/avg/max: 0/0/14
Received: 235
Sent: 234
Connections: 2
Outstanding: 0
Zxid: 0x100000117
Mode: follower
Node count: 42

$ echo status | nc node4 2181
Zookeeper version: 3.4.6-1569965, built on 02/20/2014 09:09 GMT
Clients:
 /192.168.120.155:53241[1](queued=0,recved=29290,sent=29290)
 /192.168.120.153:40275[1](queued=0,recved=25644,sent=25652)
 /192.168.120.152:54308[1](queued=0,recved=25570,sent=25570)
 /192.168.120.155:53240[1](queued=0,recved=25568,sent=25568)
 /192.168.120.154:48195[1](queued=0,recved=25625,sent=25634)
 /192.168.120.152:38243[0](queued=0,recved=1,sent=0)
 /192.168.120.155:53234[1](queued=0,recved=31166,sent=31203)
 /192.168.120.154:48202[1](queued=0,recved=25568,sent=25568)

Latency min/avg/max: 0/0/110
Received: 218610
Sent: 218675
Connections: 8
Outstanding: 0
Zxid: 0x100000117
Mode: leader
Node count: 42
-
stop zookeeper
# on node2
$ /opt/bigdata/zookeeper/bin/zkServer.sh stop
# on node3
$ /opt/bigdata/zookeeper/bin/zkServer.sh stop
# on node4
$ /opt/bigdata/zookeeper/bin/zkServer.sh stop
-
6. setup hbase
-
topology
                      node5
                 +--------------+
                 |              |
                 |    Master    |
                 |              |
                 +--------------+
                  /      |      \
                 /       |       \
                /        |        \
     node2             node3             node4
+--------------+  +--------------+  +--------------+
|              |  |              |  |              |
|  ZooKeeper   |  |  ZooKeeper   |  |  ZooKeeper   |
| RegionServer |  | RegionServer |  | RegionServer |
|              |  |              |  |              |
+--------------+  +--------------+  +--------------+
-
setup
-
copy software to node5
$ scp hbase-1.0.1.1-bin.tar.gz hduser@node5:/opt/bigdata
$ cd /opt/bigdata
$ tar -zxf hbase-1.0.1.1-bin.tar.gz
$ mv hbase-1.0.1.1 hbase
-
config hbase
$ pico /opt/bigdata/hbase/conf/regionservers
node4
node3
node2

$ pico /opt/bigdata/hbase/conf/backup-masters
node2

$ pico /opt/bigdata/hbase/conf/hbase-site.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <property>
    <name>hbase.zookeeper.quorum</name>
    <value>node4,node3,node2</value>
  </property>
  <property>
    <name>hbase.rootdir</name>
    <value>hdfs://node5:9000/hbase</value>
  </property>
  <property>
    <name>hbase.cluster.distributed</name>
    <value>true</value>
  </property>
</configuration>
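Because the ensemble from section 5 is managed outside HBase, hbase-env.sh normally needs HBASE_MANAGES_ZK turned off as well (a sketch; JAVA_HOME as used elsewhere in this guide):
$ pico /opt/bigdata/hbase/conf/hbase-env.sh
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre
# do not let HBase start its own ZooKeeper; use the external ensemble
export HBASE_MANAGES_ZK=false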
-
copy hbase to node4, node3, node2
$ scp -r /opt/bigdata/hbase hduser@node4:/opt/bigdata
$ scp -r /opt/bigdata/hbase hduser@node3:/opt/bigdata
$ scp -r /opt/bigdata/hbase hduser@node2:/opt/bigdata
-
start / stop
-
start
# on node5
$ /opt/bigdata/hbase/bin/start-hbase.sh
-
stop
# on node5
$ /opt/bigdata/hbase/bin/stop-hbase.sh
-
7. setup sqoop
-
topology
     node2                node3
+--------------+    +--------------+
|              |    |              |
| sqoop server |    | sqoop server |
|              |    |              |
+--------------+    +--------------+
-
copy software to node2
$ scp sqoop-1.99.6-bin-hadoop200.tar.gz hduser@node2:/opt/bigdata
$ cd /opt/bigdata
$ tar -zxf sqoop-1.99.6-bin-hadoop200.tar.gz
$ mv sqoop-1.99.6-bin-hadoop200 sqoop
-
config sqoop
# sqoop/server/conf/catalina.properties
# point the classpath entries at hadoop's jars:
#   hadoop's common and common's lib
#   hadoop's hdfs and hdfs's lib
#   hadoop's mapreduce and mapreduce's lib
#   hadoop's yarn and yarn's lib
$ sed 's@/usr/lib/hadoop/\*.jar@/opt/bigdata/hadoop/share/hadoop/common/\*.jar@' -i /opt/bigdata/sqoop/server/conf/catalina.properties
$ sed 's@/usr/lib/hadoop/lib/\*.jar@/opt/bigdata/hadoop/share/hadoop/common/lib/\*.jar@' -i /opt/bigdata/sqoop/server/conf/catalina.properties
$ sed 's@/usr/lib/hadoop-hdfs/\*.jar@/opt/bigdata/hadoop/share/hadoop/hdfs/\*.jar@' -i /opt/bigdata/sqoop/server/conf/catalina.properties
$ sed 's@/usr/lib/hadoop-hdfs/lib/\*.jar@/opt/bigdata/hadoop/share/hadoop/hdfs/lib/\*.jar@' -i /opt/bigdata/sqoop/server/conf/catalina.properties
$ sed 's@/usr/lib/hadoop-mapreduce/\*.jar@/opt/bigdata/hadoop/share/hadoop/mapreduce/\*.jar@' -i /opt/bigdata/sqoop/server/conf/catalina.properties
$ sed 's@/usr/lib/hadoop-mapreduce/lib/\*.jar@/opt/bigdata/hadoop/share/hadoop/mapreduce/lib/\*.jar@' -i /opt/bigdata/sqoop/server/conf/catalina.properties
$ sed 's@/usr/lib/hadoop-yarn/\*.jar@/opt/bigdata/hadoop/share/hadoop/yarn/\*.jar@' -i /opt/bigdata/sqoop/server/conf/catalina.properties
$ sed 's@/usr/lib/hadoop-yarn/lib/\*.jar@/opt/bigdata/hadoop/share/hadoop/yarn/lib/\*.jar@' -i /opt/bigdata/sqoop/server/conf/catalina.properties

# sqoop/server/conf/sqoop.properties
# hadoop's conf dir
$ sed 's@/etc/hadoop/conf/@/opt/bigdata/hadoop/etc/hadoop/@' -i /opt/bigdata/sqoop/server/conf/sqoop.properties
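A quick sanity check that the substitutions landed (a sketch; the second grep should print nothing if every sed above matched):
$ grep -c '/opt/bigdata/hadoop' /opt/bigdata/sqoop/server/conf/catalina.properties
$ grep '/usr/lib/hadoop' /opt/bigdata/sqoop/server/conf/catalina.properties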
-
verify
$ /opt/bigdata/sqoop/bin/sqoop2-tool verify
...
Verification was successful.
Tool class org.apache.sqoop.tools.tool.VerifyTool has finished correctly
-
start / stop sqoop server
-
start
$ /opt/bigdata/sqoop/bin/sqoop2-server start
Sqoop home directory: /opt/bigdata/sqoop
Setting SQOOP_HTTP_PORT:     12000
Setting SQOOP_ADMIN_PORT:     12001
Using   CATALINA_OPTS:
Adding to CATALINA_OPTS:    -Dsqoop.http.port=12000 -Dsqoop.admin.port=12001
Using CATALINA_BASE:   /opt/bigdata/sqoop/server
Using CATALINA_HOME:   /opt/bigdata/sqoop/server
Using CATALINA_TMPDIR: /opt/bigdata/sqoop/server/temp
Using JRE_HOME:        /usr/lib/jvm/java-8-openjdk-amd64/jre
Using CLASSPATH:       /opt/bigdata/sqoop/server/bin/bootstrap.jar
-
stop
$ /opt/bigdata/sqoop/bin/sqoop2-server stop
-
sqoop2-shell
-
start
$ /opt/bigdata/sqoop/bin/sqoop2-shell
-
set server
sqoop:000> set server --host node2
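To confirm the shell is now pointed at the right server, show server is a built-in sqoop2-shell command (output shown as a sketch; exact wording may vary by version):
sqoop:000> show server --all
Server host: node2
Server port: 12000
Server webapp: sqoop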
-
version
# 1. version
sqoop:000> show version
client version:
  Sqoop 1.99.6 source revision 07244c3915975f26f03d9e1edf09ab7d06619bb8
  Compiled by root on Wed Apr 29 10:40:43 CST 2015
sqoop:000> show version --all
client version:
  Sqoop 1.99.6 source revision 07244c3915975f26f03d9e1edf09ab7d06619bb8
  Compiled by root on Wed Apr 29 10:40:43 CST 2015
server version:
  Sqoop 1.99.6 source revision 07244c3915975f26f03d9e1edf09ab7d06619bb8
  Compiled by root on Wed Apr 29 10:40:43 CST 2015
API versions:
  [v1]
-
connector
sqoop:000> show connector
+----+------------------------+---------+------------------------------------------------------+----------------------+
| Id | Name                   | Version | Class                                                | Supported Directions |
+----+------------------------+---------+------------------------------------------------------+----------------------+
| 1  | generic-jdbc-connector | 1.99.6  | org.apache.sqoop.connector.jdbc.GenericJdbcConnector | FROM/TO              |
| 2  | kite-connector         | 1.99.6  | org.apache.sqoop.connector.kite.KiteConnector        | FROM/TO              |
| 3  | hdfs-connector         | 1.99.6  | org.apache.sqoop.connector.hdfs.HdfsConnector        | FROM/TO              |
| 4  | kafka-connector        | 1.99.6  | org.apache.sqoop.connector.kafka.KafkaConnector      | TO                   |
+----+------------------------+---------+------------------------------------------------------+----------------------+
-
link - hdfs
sqoop:000> create link -c 3
Creating link for connector with id 3
Please fill following values to create new link object
Name: hdfs

Link configuration

HDFS URI: hdfs://node5:9000
Hadoop conf directory: /opt/bigdata/hadoop/etc/hadoop
New link was successfully created with validation status OK and persistent id 1
sqoop:000> show link
+----+------+--------------+----------------+---------+
| Id | Name | Connector Id | Connector Name | Enabled |
+----+------+--------------+----------------+---------+
| 1  | hdfs | 3            | hdfs-connector | true    |
+----+------+--------------+----------------+---------+
-
link - mysql
sqoop:000> create link -c 1
Creating link for connector with id 1
Please fill following values to create new link object
Name: mysql

Link configuration

JDBC Driver Class: com.mysql.jdbc.Driver
JDBC Connection String: jdbc:mysql://node3/sqoop
Username: sa
Password: **
JDBC Connection Properties:
There are currently 0 values in the map:
entry# protocol=tcp
There are currently 1 values in the map:
protocol = tcp
entry#
There were warnings while create or update, but saved successfully.
Warning message: Can't connect to the database with given credentials:
Communications link failure
The last packet sent successfully to the server was 0 milliseconds ago.
The driver has not received any packets from the server.
New link was successfully created with validation status WARNING and persistent id 2
sqoop:000> show link
+----+-------+--------------+------------------------+---------+
| Id | Name  | Connector Id | Connector Name         | Enabled |
+----+-------+--------------+------------------------+---------+
| 1  | hdfs  | 3            | hdfs-connector         | true    |
| 2  | mysql | 1            | generic-jdbc-connector | true    |
+----+-------+--------------+------------------------+---------+
-
link - sqlserver
sqoop:000> create link -c 1
Creating link for connector with id 1
Please fill following values to create new link object
Name: mssql

Link configuration

JDBC Driver Class: com.microsoft.sqlserver.jdbc.SQLServerDriver
JDBC Connection String: jdbc:sqlserver://192.168.120.151
Username: sa
Password: **
JDBC Connection Properties:
There are currently 0 values in the map:
entry# protocol=tcp
There are currently 1 values in the map:
protocol = tcp
entry#
New link was successfully created with validation status OK and persistent id 3
sqoop:000> show link
+----+-------+--------------+------------------------+---------+
| Id | Name  | Connector Id | Connector Name         | Enabled |
+----+-------+--------------+------------------------+---------+
| 1  | hdfs  | 3            | hdfs-connector         | true    |
| 2  | mysql | 1            | generic-jdbc-connector | true    |
| 3  | mssql | 1            | generic-jdbc-connector | true    |
+----+-------+--------------+------------------------+---------+
-
create sqoop job to ingest data from mysql to hdfs
sqoop:000> create job --from 2 --to 1
Creating job for links with from id 2 and to id 1
Please fill following values to create new job object
Name: mysql_to_hdfs

From database configuration

Schema name: sqoop
Table name: employee
Table SQL statement:
Table column names:
Partition column name:
Null value allowed for the partition column:
Boundary query:

Incremental read

Check column:
Last value:

To HDFS configuration

Override null value:
Null value:
Output format:
  0 : TEXT_FILE
  1 : SEQUENCE_FILE
Choose: 0
Compression format:
  0 : NONE
  1 : DEFAULT
  2 : DEFLATE
  3 : GZIP
  4 : BZIP2
  5 : LZO
  6 : LZ4
  7 : SNAPPY
  8 : CUSTOM
Choose: 0
Custom compression format:
Output directory: /user/hduser
Append mode:

Throttling resources

Extractors:
Loaders:
New job was successfully created with validation status OK and persistent id 1
sqoop:000> show job
+----+---------------+----------------+--------------+---------+
| Id | Name          | From Connector | To Connector | Enabled |
+----+---------------+----------------+--------------+---------+
| 1  | mysql_to_hdfs | 1              | 3            | true    |
+----+---------------+----------------+--------------+---------+
-
start job and check status
sqoop:000> start job -j 1
Submission details
Job ID: 1
Server URL: http://node2:12000/sqoop/
Created by: hduser
Creation date: 2015-07-09 17:26:59 CST
Lastly updated by: hduser
External ID: job_1436433861773_0001
        http://node5:8088/proxy/application_1436433861773_0001/
2015-07-09 17:26:59 CST: BOOTING  - Progress is not available
sqoop:000> status job -j 1
-
8. setup spark
-
topology
                      node5
                 +--------------+
                 |              |
                 | spark master |
                 |              |
                 +--------------+
                  /      |      \
                 /       |       \
                /        |        \
     node2             node3             node4
+--------------+  +--------------+  +--------------+
|              |  |              |  |              |
| spark worker |  | spark worker |  | spark worker |
|              |  |              |  |              |
+--------------+  +--------------+  +--------------+
-
copy software to node5
$ scp spark-1.3.1-bin-hadoop2.6.tgz hduser@node5:/opt/bigdata
$ cd /opt/bigdata
$ tar -zxf spark-1.3.1-bin-hadoop2.6.tgz
$ mv spark-1.3.1-bin-hadoop2.6 spark
$ scp -r /opt/bigdata/spark hduser@node4:/opt/bigdata/spark
$ scp -r /opt/bigdata/spark hduser@node3:/opt/bigdata/spark
$ scp -r /opt/bigdata/spark hduser@node2:/opt/bigdata/spark
-
start spark master on node5
$ /opt/bigdata/spark/sbin/start-master.sh
-
start spark worker on node4 node3 node2
# on node4
$ /opt/bigdata/spark/bin/spark-class org.apache.spark.deploy.worker.Worker spark://node5:7077
# on node3
$ /opt/bigdata/spark/bin/spark-class org.apache.spark.deploy.worker.Worker spark://node5:7077
# on node2
$ /opt/bigdata/spark/bin/spark-class org.apache.spark.deploy.worker.Worker spark://node5:7077
-
demo
$ /opt/bigdata/spark/bin/spark-submit \
    --class org.apache.spark.examples.SparkPi \
    --master yarn-cluster \
    --num-executors 3 \
    --driver-memory 4g \
    --executor-memory 2g \
    --executor-cores 1 \
    --queue thequeue \
    /opt/bigdata/spark/lib/spark-examples*.jar \
    10
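Note the demo submits to YARN (--master yarn-cluster), not to the standalone master started above, and --queue thequeue assumes a YARN queue of that name exists (drop it to use the default queue). To run the same example on the standalone cluster instead (a sketch, same paths):
$ /opt/bigdata/spark/bin/spark-submit \
    --class org.apache.spark.examples.SparkPi \
    --master spark://node5:7077 \
    /opt/bigdata/spark/lib/spark-examples*.jar \
    10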
-
fixed
-
problem connecting to server
-
log on node3
2015-07-08 08:20:39,044 WARN org.apache.hadoop.hdfs.server.datanode.DataNode: Problem connecting to server: node5/192.168.120.155:9000
2015-07-08 08:20:48,046 INFO org.apache.hadoop.ipc.Client: Retrying connect to server: node5/192.168.120.155:9000. Already tried 3 time(s); retry policy is RetryUpToMaximumCountWithFixedSleep(maxRetries=10, sleepTime=1000 MILLISECONDS)
-
hdfs service failed to start, issues with datanode and namenode
Harsh J: Sorry, should've sent you a direct string... I mean something like this:

127.0.0.1 localhost johniv-able
# 127.0.1.1 johniv-able
# ^^^^^^^^^^^^^^^^^^^^^

i.e. comment out the 127.0.1.1 <hostname> line in /etc/hosts, so the hostname resolves to the machine's real address instead of the loopback alias and the datanodes can reach the namenode.
-
ERROR 2003 (HY000): Can’t connect to MySQL server on ‘node3’ (61)
-
# 1. check status
$ netstat -tulpen
(Not all processes could be identified, non-owned process info
 will not be shown, you would have to be root to see it all.)
Active Internet connections (only servers)
Proto Recv-Q Send-Q Local Address           Foreign Address   State    User   Inode    PID/Program name
tcp        0      0 0.0.0.0:22              0.0.0.0:*         LISTEN   0      20360    -
tcp        0      0 0.0.0.0:3306            0.0.0.0:*         LISTEN   121    31073    -
tcp        0      0 0.0.0.0:50010           0.0.0.0:*         LISTEN   1001   29143    4193/java
tcp        0      0 0.0.0.0:50020           0.0.0.0:*         LISTEN   1001   33985    4193/java
tcp        0      0 0.0.0.0:50075           0.0.0.0:*         LISTEN   1001   29188    4193/java
tcp        0      0 127.0.0.1:48933         0.0.0.0:*         LISTEN   1001   29149    4193/java
tcp        0      0 127.0.0.1:5939          0.0.0.0:*         LISTEN   0      18701    -
tcp        0      0 127.0.0.1:631           0.0.0.0:*         LISTEN   0      120296   -
tcp6       0      0 192.168.120.153:16020   :::*              LISTEN   1001   33404    4416/java
tcp6       0      0 ::1:631                 :::*              LISTEN   0      120295   -
tcp6       0      0 :::13562                :::*              LISTEN   1001   33608    4840/java
tcp6       0      0 :::16030                :::*              LISTEN   1001   33425    4416/java
tcp6       0      0 :::22                   :::*              LISTEN   0      20362    -
tcp6       0      0 :::52600                :::*              LISTEN   1001   41017    4840/java
tcp6       0      0 :::8040                 :::*              LISTEN   1001   33604    4840/java
tcp6       0      0 :::8042                 :::*              LISTEN   1001   33609    4840/java
udp        0      0 0.0.0.0:41761           0.0.0.0:*                  107    16926    -
udp        0      0 0.0.0.0:5353            0.0.0.0:*                  107    16924    -
udp        0      0 0.0.0.0:631             0.0.0.0:*                  0      18693    -
udp6       0      0 :::52756                :::*                       107    16927    -
udp6       0      0 :::5353                 :::*                       107    16925    -
# 2. refer to the `my.cnf` edit below
-
edit my.cnf
$ sudo pico /etc/alternatives/my.cnf
or
$ sudo pico /etc/mysql/my.cnf

# You can copy this to one of:
# - "/etc/mysql/my.cnf" to set global options,
# - "~/.my.cnf" to set user-specific options.
[mysqld]
bind-address = 0.0.0.0
-
grant privileges
mysql> grant all privileges on *.* to 'sa'@'%' identified by 'sa';
-
restart mysql service
$ /etc/init.d/mysql restart
or
$ service mysql restart
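After the restart, remote access can be verified from another node (a sketch; assumes the mysql client is installed there and the sa grant above has been applied):
$ mysql -h node3 -u sa -p -e 'select 1'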
-
Connection refused
-
java.net.ConnectException
sqoop:000> start job -j 1
2015-07-09 16:30:32 CST: FAILURE_ON_SUBMIT
Exception: java.net.ConnectException: Call From node2/192.168.120.152 to node5:8040 failed on connection exception:
java.net.ConnectException: Connection refused;
For more details see: http://wiki.apache.org/hadoop/ConnectionRefused
-
fixed
# yarn not started
# on node5
$ cd /opt/bigdata/hadoop/etc/hadoop
$ grep 8040 ./*
./yarn-site.xml:    <value>node5:8040</value>
# so start yarn
$ /opt/bigdata/hadoop/sbin/start-yarn.sh
-
Class not found
-
Error message: Class not found
sqoop:000> create link -c 1
Creating link for connector with id 1
Please fill following values to create new link object
Name: mssql

Link configuration

JDBC Driver Class: com.microsoft.jdbc.sqlserver
JDBC Connection String: jdbc:sqlserver://192.168.120.151
Username: sa
Password: **
JDBC Connection Properties:
There are currently 0 values in the map:
entry# protocol=tcp
There are currently 1 values in the map:
protocol = tcp
entry#
There are issues with entered data, please revise your input:
Name: mssql

Link configuration

Error message: Class not found
JDBC Driver Class: com.microsoft.jdbc.sqlserver.SQLServerDriver
-
fixed
# 1. find `sqljdbc4.jar`
$ find / -name sqljdbc4.jar 2>/dev/null
# 2. list the jar's contents and check the package path
$ unzip -l sqljdbc4.jar | grep -i sqlserverdriver
# 3. so sqlserver's driver class is
#    com.microsoft.sqlserver.jdbc.SQLServerDriver
-