06 July 2015

steps

  1. install ubuntu 15.04 desktop on 5 nodes

  2. setup ssh

  3. setup hadoop

  4. config and start hadoop

  5. setup zookeeper

  6. setup hbase

  7. setup sqoop

  8. setup spark

  9. fixed

1. setup ubuntu

  1. download and install ubuntu desktop on 5 nodes

  2. setup static ip address for 5 nodes

    1. node5

       $ sudo pico /etc/network/interfaces
       # interfaces(5) file used by ifup(8) and ifdown(8)
       auto lo
       iface lo inet loopback
      
       auto eth0
       iface eth0 inet static
       address 192.168.120.155
       netmask 255.255.255.0
       network 192.168.120.0
       broadcast 192.168.120.255
       gateway 192.168.120.1
       dns-nameservers 192.168.10.220 192.168.10.221
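      
       # apply the new settings (ifdown/ifup assumed usable on this
       # desktop install; a reboot works too)
       $ sudo ifdown eth0 && sudo ifup eth0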
      
       # note: resolvconf regenerates this file, so manual edits here are
       # transient; the dns-nameservers line above is the persistent fix
       $ sudo pico /etc/resolv.conf 
       # Dynamic resolv.conf(5) file for glibc resolver(3) generated by resolvconf(8)
       #     DO NOT EDIT THIS FILE BY HAND -- YOUR CHANGES WILL BE OVERWRITTEN
      
       nameserver 192.168.10.220
       nameserver 192.168.10.221
      
       $ sudo pico /etc/hosts
       127.0.0.1   localhost
       127.0.1.1   node5
      
       # The following lines are desirable for IPv6 capable hosts
       ::1     ip6-localhost ip6-loopback
       fe00::0 ip6-localnet
       ff00::0 ip6-mcastprefix
       ff02::1 ip6-allnodes
       ff02::2 ip6-allrouters
      
       192.168.120.155 node5
       192.168.120.154 node4
       192.168.120.153 node3
       192.168.120.152 node2
       192.168.120.151 node1
      
       $ ping www.baidu.com
      
    2. node1 node2 node3 node4

       # same as node5
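      
       # a sketch of the only values that change per node, reusing the
       # node5 template:
       #   node1 -> address 192.168.120.151, and 127.0.1.1 node1 in /etc/hosts
       #   node2 -> address 192.168.120.152, and 127.0.1.1 node2
       #   node3 -> address 192.168.120.153, and 127.0.1.1 node3
       #   node4 -> address 192.168.120.154, and 127.0.1.1 node4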
      
    3. mac os x yosemite

       $ pico /private/etc/hosts
       ##
       # Host Database
       #
       # localhost is used to configure the loopback interface
       # when the system is booting.  Do not change this entry.
       ##
       127.0.0.1   localhost
       255.255.255.255 broadcasthost
       ::1             localhost 
      
       192.168.120.155 node5
       192.168.120.154 node4
       192.168.120.153 node3
       192.168.120.152 node2
       192.168.120.151 node1
      
  3. change apt sources

         $ sudo sed 's@cn.archive.ubuntu.com@mirrors.163.com@' -i /etc/apt/sources.list
         $ sudo sed 's@security.ubuntu.com@mirrors.163.com@' -i /etc/apt/sources.list
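     
          # refresh the package index against the new mirror
          $ sudo apt-get update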
    
  4. set HADOOP_HOME JAVA_HOME

          $ pico .bashrc
     
          ...
          export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre
          export HADOOP_HOME=/opt/bigdata/hadoop
          export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
          export PATH=$PATH:$JAVA_HOME/bin
         ...
    
         $ source .bashrc
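     
          # quick sanity check
          $ echo $JAVA_HOME
          $ $JAVA_HOME/bin/java -version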
    
  5. firewall

    1. check status of firewall

       $ sudo ufw status
      
    2. disable firewall (ufw fronts iptables)

       $ sudo ufw disable
      
    3. reload / restart firewall

       $ sudo ufw reload
      
    4. get ipv4 iptables status

       $ sudo iptables -L -n -v
      
    5. get ipv6 ip6tables status

       $ sudo ip6tables -L -n -v
      
  6. add user

          $ sudo addgroup hadoop
          $ sudo adduser --ingroup hadoop hduser
          $ sudo mkdir -p /opt/bigdata
          $ sudo chown -R hduser:hadoop /opt/bigdata
    
  7. sublime text

    1. for sublime-text-2:

       $ sudo add-apt-repository ppa:webupd8team/sublime-text-2
       $ sudo apt-get update
       $ sudo apt-get install sublime-text
      
    2. for sublime-text-3:

       $ sudo add-apt-repository ppa:webupd8team/sublime-text-3
       $ sudo apt-get update
       $ sudo apt-get install sublime-text-installer
      
  8. install java on five nodes

          # match the java-8 JAVA_HOME set above
          $ sudo apt-get install openjdk-8-jdk
    

2. setup ssh

  1. install openssh-server

    1. node1 to node5

       $ sudo apt-get install openssh-server
      
  2. generate ssh key

    1. node5

       $ su - hduser
       $ ssh-keygen -t rsa -P "" 
       $ cd ~/.ssh
       $ cat id_rsa.pub >> authorized_keys
      
  3. setup passphraseless ssh

         $ ssh-copy-id -i id_rsa.pub hduser@node1
         $ ssh-copy-id -i id_rsa.pub hduser@node2
         $ ssh-copy-id -i id_rsa.pub hduser@node3
         $ ssh-copy-id -i id_rsa.pub hduser@node4
    
  4. check ssh login

         $ ssh hduser@node1
         ...
         exit
    
         $ ssh hduser@node2
         ...
         exit
    
         $ ssh hduser@node3
         ...
         exit
    
         $ ssh hduser@node4
         ...
         exit
    
  5. remove key from known hosts

         $ rm -f .ssh/known_hosts
    
         or
    
          $ ssh-keygen -R node1
    

3. setup hadoop

  1. topology

                               node5
                          +--------------+
                          |              |
                          |   NameNode   |
                          |              |
                          +--------------+
                         /       |        \
                        /        |         \
                       /         |          \
                      /          |           \
                     /           |            \
                    /            |             \
               node2            node3           node4
         +--------------+ +--------------+ +--------------+
         |              | |              | |              |
         |  DataNode    | |  DataNode    | |  DataNode    |
         |              | |              | |              |
         +--------------+ +--------------+ +--------------+
    
  2. setup

    1. copy software to node

       $ scp hadoop-2.7.0.tar.gz hduser@node5:~/
      
    2. mkdir to install hadoop

       $ ssh hduser@node5
       $ sudo mkdir /opt/bigdata
       $ sudo chown -R hduser:hadoop /opt/bigdata
       $ exit
      
       ...
       # node4 node3 node2
       ...
      
       $ ssh hduser@node1
       $ sudo mkdir /opt/bigdata
        $ sudo chown -R hduser:hadoop /opt/bigdata
       $ exit
      
    3. unzip, config, and scp

       $ tar -zxvf hadoop-2.7.0.tar.gz
       $ mv hadoop-2.7.0 /opt/bigdata/hadoop
      
       $ scp core-site.xml hdfs-site.xml mapred-site.xml slaves yarn-site.xml \
       hduser@node5:/opt/bigdata/hadoop/etc/hadoop
      
       $ pico .bashrc
       ...
       export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre
       export HADOOP_PREFIX="/opt/bigdata/hadoop"
       export HADOOP_HOME=$HADOOP_PREFIX
       export HADOOP_COMMON_HOME=$HADOOP_PREFIX
       export HADOOP_CONF_DIR=$HADOOP_PREFIX/etc/hadoop
       export HADOOP_HDFS_HOME=$HADOOP_PREFIX
       export HADOOP_MAPRED_HOME=$HADOOP_PREFIX
       export HADOOP_YARN_HOME=$HADOOP_PREFIX
        export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
        export PATH=$PATH:$JAVA_HOME/bin
       ...
      
    4. sync folder

       $ scp .bashrc hduser@node4:~/
       $ scp .bashrc hduser@node3:~/
       $ scp .bashrc hduser@node2:~/
       $ scp .bashrc hduser@node1:~/
      
       $ scp -r /opt/bigdata/hadoop hduser@node4:/opt/bigdata/hadoop
       $ scp -r /opt/bigdata/hadoop hduser@node3:/opt/bigdata/hadoop
       $ scp -r /opt/bigdata/hadoop hduser@node2:/opt/bigdata/hadoop
       $ scp -r /opt/bigdata/hadoop hduser@node1:/opt/bigdata/hadoop
       $ scp -r /opt/bigdata/hadoop/etc/hadoop hduser@node4:/opt/bigdata/hadoop/etc
       $ scp -r /opt/bigdata/hadoop/etc/hadoop hduser@node3:/opt/bigdata/hadoop/etc
       $ scp -r /opt/bigdata/hadoop/etc/hadoop hduser@node2:/opt/bigdata/hadoop/etc
       $ scp -r /opt/bigdata/hadoop/etc/hadoop hduser@node1:/opt/bigdata/hadoop/etc
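       
        # optional sanity check from node5 (assumes JAVA_HOME is visible
        # to non-interactive shells, e.g. via hadoop-env.sh):
        $ for n in node1 node2 node3 node4; do ssh hduser@$n '/opt/bigdata/hadoop/bin/hadoop version | head -1'; done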
      

4. config and start hadoop

  1. config

    1. core-site.xml

       <?xml version="1.0" encoding="UTF-8"?>
       <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
       <configuration>
           <property>
                <name>fs.defaultFS</name>
               <value>hdfs://node5:9000</value>
           </property>
           <property>
               <name>hadoop.tmp.dir</name>
                <value>/app/hadoop/tmp</value>
           </property>
       </configuration>
      
    2. hdfs-site.xml

       <?xml version="1.0" encoding="UTF-8"?>
       <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
       <configuration>
           <property>
               <name>dfs.namenode.checkpoint.period</name>
               <value>3600</value>
           </property>
           <property>
               <name>dfs.namenode.name.dir</name>
               <value>file:///app/hadoop/hdfs/name</value>
               <final>true</final>
           </property>
           <property>
               <name>dfs.datanode.data.dir</name>
               <value>file:///app/hadoop/hdfs/data</value>
               <final>true</final>
           </property>
           <property>
               <name>dfs.blocksize</name>
               <value>134217728</value>
           </property>
           <property>
               <name>dfs.replication</name>
               <value>3</value>
           </property>
           <property>
               <name>dfs.permissions</name>
               <value>false</value>
           </property>
           <property>
               <name>dfs.namenode.handler.count</name>
               <value>50</value>
           </property>
           <property>
               <name>dfs.namenode.checkpoint.dir</name>
               <value>file:///app/hadoop/hdfs/namesecondary</value>
           </property>
       </configuration>
      
    3. yarn-site.xml

       <?xml version="1.0"?>
       <configuration>
           <property>
               <name>yarn.nodemanager.aux-services</name>
               <value>mapreduce_shuffle</value>
           </property>
           <property>
               <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
               <value>org.apache.hadoop.mapred.ShuffleHandler</value>
           </property>
           <property>
               <name>yarn.resourcemanager.resource-tracker.address</name>
               <value>node5:8025</value>
           </property>
           <property>
               <name>yarn.resourcemanager.scheduler.address</name>
               <value>node5:8030</value>
           </property>
           <property>
               <name>yarn.resourcemanager.address</name>
               <value>node5:8040</value>
           </property>
       </configuration>
      
    4. mapred-site.xml

       <?xml version="1.0"?>
       <configuration>
           <property>
               <name>mapreduce.framework.name</name>
               <value>yarn</value>
           </property>
       </configuration>
      
    5. slaves

       node4
       node3
       node2
       node1
      
  2. format and start

    1. on node5 format hdfs

       $ bin/hdfs namenode -format
      
        # the namenode leaves safemode automatically once enough
        # datanodes report in; after starting (next step), verify:
       $ bin/hdfs dfsadmin -report
       Configured Capacity: 5854310649856 (5.32 TB)
       Present Capacity: 5532915458048 (5.03 TB)
       DFS Remaining: 5532915384320 (5.03 TB)
       DFS Used: 73728 (72 KB)
       DFS Used%: 0.00%
       Under replicated blocks: 0
       Blocks with corrupt replicas: 0
       Missing blocks: 0
       Missing blocks (with replication factor 1): 0
      
       -------------------------------------------------
       Live datanodes (3):
      
       Name: 192.168.120.154:50010 (node4)
       Hostname: node4
       Decommission Status : Normal
       Configured Capacity: 1951699709952 (1.78 TB)
       DFS Used: 24576 (24 KB)
       Non DFS Used: 103953088512 (96.81 GB)
       DFS Remaining: 1847746596864 (1.68 TB)
       DFS Used%: 0.00%
       DFS Remaining%: 94.67%
       Configured Cache Capacity: 0 (0 B)
       Cache Used: 0 (0 B)
       Cache Remaining: 0 (0 B)
       Cache Used%: 100.00%
       Cache Remaining%: 0.00%
       Xceivers: 1
       Last contact: Wed Jul 08 08:50:45 CST 2015
      
      
       Name: 192.168.120.152:50010 (node2)
       Hostname: node2
       Decommission Status : Normal
       Configured Capacity: 1951307567104 (1.77 TB)
       DFS Used: 24576 (24 KB)
       Non DFS Used: 104823349248 (97.62 GB)
       DFS Remaining: 1846484193280 (1.68 TB)
       DFS Used%: 0.00%
       DFS Remaining%: 94.63%
       Configured Cache Capacity: 0 (0 B)
       Cache Used: 0 (0 B)
       Cache Remaining: 0 (0 B)
       Cache Used%: 100.00%
       Cache Remaining%: 0.00%
       Xceivers: 1
       Last contact: Wed Jul 08 08:50:45 CST 2015
      
      
       Name: 192.168.120.153:50010 (node3)
       Hostname: node3
       Decommission Status : Normal
       Configured Capacity: 1951303372800 (1.77 TB)
       DFS Used: 24576 (24 KB)
       Non DFS Used: 112618754048 (104.88 GB)
       DFS Remaining: 1838684594176 (1.67 TB)
       DFS Used%: 0.00%
       DFS Remaining%: 94.23%
       Configured Cache Capacity: 0 (0 B)
       Cache Used: 0 (0 B)
       Cache Remaining: 0 (0 B)
       Cache Used%: 100.00%
       Cache Remaining%: 0.00%
       Xceivers: 1
       Last contact: Wed Jul 08 08:50:45 CST 2015
      
    2. on node5 start

       $ sbin/start-dfs.sh
       $ sbin/start-yarn.sh
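       
        # verify daemons with jps: node5 should roughly show NameNode,
        # SecondaryNameNode and ResourceManager; each slave DataNode
        # and NodeManager
        $ jps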
      
    3. on node5 stop

       $ sbin/stop-yarn.sh
       $ sbin/stop-dfs.sh
      

5. setup zookeeper

  1. topology

             node2         node3         node4
         +-----------+ +-----------+ +-----------+
         |           | |           | |           |
         | ZooKeeper | | ZooKeeper | | ZooKeeper |
         |           | |           | |           |
         +-----------+ +-----------+ +-----------+
    
  2. setup

    1. copy software to node4

        $ scp zookeeper-3.4.6.tar.gz hduser@node4:/opt/bigdata
       
        # on node4
        $ cd /opt/bigdata
        $ tar -zxf zookeeper-3.4.6.tar.gz
        $ mv zookeeper-3.4.6 zookeeper
      
    2. config zookeeper

       $ pico /opt/bigdata/zookeeper/conf/zoo.cfg
       tickTime=2000
       dataDir=/app/hadoop/zookeeper/data
       clientPort=2181
       initLimit=10
       syncLimit=5
       server.4=node4:20010:20011
       server.3=node3:20010:20011
       server.2=node2:20010:20011
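       
        # server.N=host:peerPort:electionPort; N must match the node's
        # myid (set below), 20010 carries follower-to-leader traffic,
        # 20011 leader election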
      
    3. copy zookeeper to node3, node2

        $ scp -r /opt/bigdata/zookeeper hduser@node3:/opt/bigdata
        $ scp -r /opt/bigdata/zookeeper hduser@node2:/opt/bigdata
      
    4. mkdir for zookeeper data

        # on node2
        $ sudo mkdir -p /app/hadoop/zookeeper/data
        $ sudo chown -R hduser:hadoop /app/hadoop/zookeeper
       
        # on node3
        $ sudo mkdir -p /app/hadoop/zookeeper/data
        $ sudo chown -R hduser:hadoop /app/hadoop/zookeeper
       
        # on node4
        $ sudo mkdir -p /app/hadoop/zookeeper/data
        $ sudo chown -R hduser:hadoop /app/hadoop/zookeeper
      
    5. config node’s myid

       # on node2
       $ pico /app/hadoop/zookeeper/data/myid
       2
      
       # on node3
       $ pico /app/hadoop/zookeeper/data/myid
       3
      
       # on node4
       $ pico /app/hadoop/zookeeper/data/myid
       4
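       
        # equivalent one-liner from node5, assuming passphraseless ssh:
        $ for n in 2 3 4; do ssh hduser@node$n "echo $n > /app/hadoop/zookeeper/data/myid"; done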
      
  3. start status stop

    1. start zookeeper

       # on node2
       $ /opt/bigdata/zookeeper/bin/zkServer.sh start
      
       # on node3
       $ /opt/bigdata/zookeeper/bin/zkServer.sh start
      
       # on node4
       $ /opt/bigdata/zookeeper/bin/zkServer.sh start
      
    2. check zookeeper status

       # on node2
       $ /opt/bigdata/zookeeper/bin/zkServer.sh status
       JMX enabled by default
       Using config: /opt/bigdata/zookeeper/bin/../conf/zoo.cfg
       Mode: follower
      
       # on node3
       $ /opt/bigdata/zookeeper/bin/zkServer.sh status
       JMX enabled by default
       Using config: /opt/bigdata/zookeeper/bin/../conf/zoo.cfg
       Mode: follower
      
       # on node4
       $ /opt/bigdata/zookeeper/bin/zkServer.sh status
       JMX enabled by default
       Using config: /opt/bigdata/zookeeper/bin/../conf/zoo.cfg
       Mode: leader
      
    3. check zookeeper status via four-letter commands

       # use `echo` and `nc`
       $ echo ruok | nc node2 2181
       imok
      
       $ echo status | nc node2 2181
       Zookeeper version: 3.4.6-1569965, built on 02/20/2014 09:09 GMT
       Clients:
        /192.168.120.154:47757[1](queued=0,recved=25564,sent=25564)
        /192.168.120.153:44776[1](queued=0,recved=25565,sent=25565)
        /192.168.120.152:54094[1](queued=0,recved=25565,sent=25565)
        /192.168.120.152:38021[0](queued=0,recved=1,sent=0)
        /192.168.120.152:54089[1](queued=0,recved=25648,sent=25662)
        /192.168.120.155:35662[1](queued=0,recved=25565,sent=25565)
        /192.168.120.153:44772[1](queued=0,recved=25566,sent=25566)
      
       Latency min/avg/max: 0/0/36
       Received: 170608
       Sent: 170657
       Connections: 7
       Outstanding: 0
       Zxid: 0x100000117
       Mode: follower
       Node count: 42
      
       $ echo status | nc node3 2181
       Zookeeper version: 3.4.6-1569965, built on 02/20/2014 09:09 GMT
       Clients:
        /192.168.120.154:39911[1](queued=0,recved=227,sent=227)
        /192.168.120.152:48327[0](queued=0,recved=1,sent=0)
      
       Latency min/avg/max: 0/0/14
       Received: 235
       Sent: 234
       Connections: 2
       Outstanding: 0
       Zxid: 0x100000117
       Mode: follower
       Node count: 42
      
        $ echo status | nc node4 2181
        Zookeeper version: 3.4.6-1569965, built on 02/20/2014 09:09 GMT
       Clients:
        /192.168.120.155:53241[1](queued=0,recved=29290,sent=29290)
        /192.168.120.153:40275[1](queued=0,recved=25644,sent=25652)
        /192.168.120.152:54308[1](queued=0,recved=25570,sent=25570)
        /192.168.120.155:53240[1](queued=0,recved=25568,sent=25568)
        /192.168.120.154:48195[1](queued=0,recved=25625,sent=25634)
        /192.168.120.152:38243[0](queued=0,recved=1,sent=0)
        /192.168.120.155:53234[1](queued=0,recved=31166,sent=31203)
        /192.168.120.154:48202[1](queued=0,recved=25568,sent=25568)
      
       Latency min/avg/max: 0/0/110
       Received: 218610
       Sent: 218675
       Connections: 8
       Outstanding: 0
       Zxid: 0x100000117
       Mode: leader
       Node count: 42
      
    4. stop zookeeper

       # on node2
       $ /opt/bigdata/zookeeper/bin/zkServer.sh stop
      
       # on node3
       $ /opt/bigdata/zookeeper/bin/zkServer.sh stop
      
       # on node4
       $ /opt/bigdata/zookeeper/bin/zkServer.sh stop
      

6. setup hbase

  1. topology

                               node5
                          +--------------+
                          |              |
                          |    Master    |
                          |              |
                          +--------------+
                         /       |        \
                        /        |         \
                       /         |          \
                      /          |           \
                     /           |            \
                    /            |             \
               node2            node3           node4
         +--------------+ +--------------+ +--------------+
         |              | |              | |              |
         | ZooKeeper    | | ZooKeeper    | | ZooKeeper    |
         | RegionServer | | RegionServer | | RegionServer |
         |              | |              | |              |
         +--------------+ +--------------+ +--------------+
    
  2. setup

    1. copy software to node5

        $ scp hbase-1.0.1.1-bin.tar.gz hduser@node5:/opt/bigdata
       
        # on node5
        $ cd /opt/bigdata
        $ tar -zxf hbase-1.0.1.1-bin.tar.gz
        $ mv hbase-1.0.1.1 hbase
      
    2. config hbase

       $ pico /opt/bigdata/hbase/conf/regionservers
       node4
       node3
       node2
      
       $ pico /opt/bigdata/hbase/conf/backup-masters
       node2
      
       $ pico /opt/bigdata/hbase/conf/hbase-site.xml
       <?xml version="1.0"?>
       <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
       <configuration>
           <property>
               <name>hbase.zookeeper.quorum</name>
               <value>node4,node3,node2</value>
           </property>
           <property>
               <name>hbase.rootdir</name>
               <value>hdfs://node5:9000/hbase</value>
           </property> 
           <property>
               <name>hbase.cluster.distributed</name>
               <value>true</value>
           </property>
       </configuration>
      
    3. copy hbase to node4, node3, node2

        $ scp -r /opt/bigdata/hbase hduser@node4:/opt/bigdata
        $ scp -r /opt/bigdata/hbase hduser@node3:/opt/bigdata
        $ scp -r /opt/bigdata/hbase hduser@node2:/opt/bigdata
      
  3. start stop

    1. start

       # on node5
       $ /opt/bigdata/hbase/bin/start-hbase.sh
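       
        # optional smoke test from the hbase shell
        $ /opt/bigdata/hbase/bin/hbase shell
        hbase(main):001:0> status
        hbase(main):002:0> exit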
      
    2. stop

       # on node5
       $ /opt/bigdata/hbase/bin/stop-hbase.sh
      

7. setup sqoop

  1. topology

               node2            node3    
         +--------------+ +--------------+
         |              | |              |
         | sqoop server | | sqoop server |
         |              | |              |
         +--------------+ +--------------+
    
  2. setup

    1. copy software to node2

        $ scp sqoop-1.99.6-bin-hadoop200.tar.gz hduser@node2:/opt/bigdata
       
        # on node2
        $ cd /opt/bigdata
        $ tar -zxf sqoop-1.99.6-bin-hadoop200.tar.gz
        $ mv sqoop-1.99.6-bin-hadoop200 sqoop
      
    2. config sqoop

       # sqoop/server/conf/catalina.properties
       # hadoop's common    and common's    lib
       # hadoop's hdfs      and hdfs's      lib
       # hadoop's mapreduce and mapreduce's lib
       # hadoop's yarn      and yarn's      lib
       $ sed 's@/usr/lib/hadoop/\*.jar@/opt/bigdata/hadoop/share/hadoop/common/\*.jar@' -i /opt/bigdata/sqoop/server/conf/catalina.properties
       $ sed 's@/usr/lib/hadoop/lib/\*.jar@/opt/bigdata/hadoop/share/hadoop/common/lib/\*.jar@' -i /opt/bigdata/sqoop/server/conf/catalina.properties
       $ sed 's@/usr/lib/hadoop-hdfs/\*.jar@/opt/bigdata/hadoop/share/hadoop/hdfs/\*.jar@' -i /opt/bigdata/sqoop/server/conf/catalina.properties
       $ sed 's@/usr/lib/hadoop-hdfs/lib/\*.jar@/opt/bigdata/hadoop/share/hadoop/hdfs/lib/\*.jar@' -i /opt/bigdata/sqoop/server/conf/catalina.properties
       $ sed 's@/usr/lib/hadoop-mapreduce/\*.jar@/opt/bigdata/hadoop/share/hadoop/mapreduce/\*.jar@' -i /opt/bigdata/sqoop/server/conf/catalina.properties
       $ sed 's@/usr/lib/hadoop-mapreduce/lib/\*.jar@/opt/bigdata/hadoop/share/hadoop/mapreduce/lib/\*.jar@' -i /opt/bigdata/sqoop/server/conf/catalina.properties
       $ sed 's@/usr/lib/hadoop-yarn/\*.jar@/opt/bigdata/hadoop/share/hadoop/yarn/\*.jar@' -i /opt/bigdata/sqoop/server/conf/catalina.properties
       $ sed 's@/usr/lib/hadoop-yarn/lib/\*.jar@/opt/bigdata/hadoop/share/hadoop/yarn/lib/\*.jar@' -i /opt/bigdata/sqoop/server/conf/catalina.properties
              
       # sqoop/server/conf/sqoop.properties
       # hadoop's conf dir
       $ sed 's@/etc/hadoop/conf/@/opt/bigdata/hadoop/etc/hadoop/@' -i /opt/bigdata/sqoop/server/conf/sqoop.properties
      
    3. verify

       $ /opt/bigdata/sqoop/bin/sqoop2-tool verify
       ...
       Verification was successful.
       Tool class org.apache.sqoop.tools.tool.VerifyTool has finished correctly
      
  3. start stop sqoop server

    1. start

       $ /opt/bigdata/sqoop/bin/sqoop2-server start
      
       Sqoop home directory: /opt/bigdata/sqoop
       Setting SQOOP_HTTP_PORT:     12000
       Setting SQOOP_ADMIN_PORT:     12001
       Using   CATALINA_OPTS:       
       Adding to CATALINA_OPTS:    -Dsqoop.http.port=12000 -Dsqoop.admin.port=12001
       Using CATALINA_BASE:   /opt/bigdata/sqoop/server
       Using CATALINA_HOME:   /opt/bigdata/sqoop/server
       Using CATALINA_TMPDIR: /opt/bigdata/sqoop/server/temp
       Using JRE_HOME:        /usr/lib/jvm/java-8-openjdk-amd64/jre
       Using CLASSPATH:       /opt/bigdata/sqoop/server/bin/bootstrap.jar
      
    2. stop

       $ /opt/bigdata/sqoop/bin/sqoop2-server stop
      
  4. sqoop2-shell

    1. start

       $ /opt/bigdata/sqoop/bin/sqoop2-shell
      
    2. set server

       sqoop:000> set server --host node2
      
    3. version

       # 1. version
       sqoop:000> show version      
       client version:
         Sqoop 1.99.6 source revision 07244c3915975f26f03d9e1edf09ab7d06619bb8 
         Compiled by root on Wed Apr 29 10:40:43 CST 2015
       sqoop:000> show version --all  
       client version:
         Sqoop 1.99.6 source revision 07244c3915975f26f03d9e1edf09ab7d06619bb8 
         Compiled by root on Wed Apr 29 10:40:43 CST 2015
       server version:
         Sqoop 1.99.6 source revision 07244c3915975f26f03d9e1edf09ab7d06619bb8 
         Compiled by root on Wed Apr 29 10:40:43 CST 2015
       API versions:
         [v1]
      
    4. connector

       sqoop:000> show connector      
       +----+------------------------+---------+------------------------------------------------------+----------------------+
       | Id |          Name          | Version |                        Class                         | Supported Directions |
       +----+------------------------+---------+------------------------------------------------------+----------------------+
       | 1  | generic-jdbc-connector | 1.99.6  | org.apache.sqoop.connector.jdbc.GenericJdbcConnector | FROM/TO              |
       | 2  | kite-connector         | 1.99.6  | org.apache.sqoop.connector.kite.KiteConnector        | FROM/TO              |
       | 3  | hdfs-connector         | 1.99.6  | org.apache.sqoop.connector.hdfs.HdfsConnector        | FROM/TO              |
       | 4  | kafka-connector        | 1.99.6  | org.apache.sqoop.connector.kafka.KafkaConnector      | TO                   |
       +----+------------------------+---------+------------------------------------------------------+----------------------+
      
    5. link - hdfs

       sqoop:000> create link -c 3
       Creating link for connector with id 3
       Please fill following values to create new link object
       Name: hdfs
      
       Link configuration
      
       HDFS URI: hdfs://node5:9000
       Hadoop conf directory: /opt/bigdata/hadoop/etc/hadoop 
       New link was successfully created with validation status OK and persistent id 1
       sqoop:000> show link
       +----+------+--------------+----------------+---------+
       | Id | Name | Connector Id | Connector Name | Enabled |
       +----+------+--------------+----------------+---------+
       | 1  | hdfs | 3            | hdfs-connector | true    |
       +----+------+--------------+----------------+---------+
      
    6. link - mysql

       sqoop:000> create link -c 1
       Creating link for connector with id 1
       Please fill following values to create new link object
       Name: mysql
      
       Link configuration
      
       JDBC Driver Class: com.mysql.jdbc.Driver
       JDBC Connection String: jdbc:mysql://node3/sqoop
       Username: sa
       Password: **
       JDBC Connection Properties: 
       There are currently 0 values in the map:
       entry# protocol=tcp
       There are currently 1 values in the map:
       protocol = tcp
       entry# 
      
       There were warnings while create or update, but saved successfully.
       Warning message: Can't connect to the database with given credentials: Communications link failure
      
       The last packet sent successfully to the server was 0 milliseconds ago. The driver has not received any packets from the server. 
       New link was successfully created with validation status WARNING and persistent id 2
       sqoop:000> show link
       +----+-------+--------------+------------------------+---------+
       | Id | Name  | Connector Id |     Connector Name     | Enabled |
       +----+-------+--------------+------------------------+---------+
       | 1  | hdfs  | 3            | hdfs-connector         | true    |
       | 2  | mysql | 1            | generic-jdbc-connector | true    |
       +----+-------+--------------+------------------------+---------+
      
    7. link - sqlserver

       sqoop:000> create link -c 1
       Creating link for connector with id 1
       Please fill following values to create new link object
       Name: mssql
      
       Link configuration
      
       JDBC Driver Class: com.microsoft.sqlserver.jdbc.SQLServerDriver
       JDBC Connection String: jdbc:sqlserver://192.168.120.151
       Username: sa
       Password: **
       JDBC Connection Properties: 
       There are currently 0 values in the map:
       entry# protocol=tcp 
       There are currently 1 values in the map:
       protocol = tcp
       entry# 
       New link was successfully created with validation status OK and persistent id 3
       sqoop:000> show link
       +----+-------+--------------+------------------------+---------+
       | Id | Name  | Connector Id |     Connector Name     | Enabled |
       +----+-------+--------------+------------------------+---------+
       | 1  | hdfs  | 3            | hdfs-connector         | true    |
       | 2  | mysql | 1            | generic-jdbc-connector | true    |
       | 3  | mssql | 1            | generic-jdbc-connector | true    |
       +----+-------+--------------+------------------------+---------+
      
    8. create sqoop job to ingest data from mysql to hdfs

       sqoop:000> create job --from 2 --to 1
       Creating job for links with from id 2 and to id 1
       Please fill following values to create new job object
       Name: mysql_to_hdfs
      
       From database configuration
      
       Schema name: sqoop
       Table name: employee
       Table SQL statement: 
       Table column names: 
       Partition column name: 
       Null value allowed for the partition column: 
       Boundary query: 
      
       Incremental read
      
       Check column: 
       Last value: 
      
       To HDFS configuration
      
       Override null value: 
       Null value: 
       Output format: 
         0 : TEXT_FILE
         1 : SEQUENCE_FILE
       Choose: 0
       Compression format: 
         0 : NONE
         1 : DEFAULT
         2 : DEFLATE
         3 : GZIP
         4 : BZIP2
         5 : LZO
         6 : LZ4
         7 : SNAPPY
         8 : CUSTOM
       Choose: 0
       Custom compression format: 
       Output directory: /user/hduser
       Append mode: 
      
       Throttling resources
      
       Extractors: 
       Loaders: 
       New job was successfully created with validation status OK  and persistent id 1
       sqoop:000> show job
       +----+---------------+----------------+--------------+---------+
       | Id |     Name      | From Connector | To Connector | Enabled |
       +----+---------------+----------------+--------------+---------+
       | 1  | mysql_to_hdfs | 1              | 3            | true    |
       +----+---------------+----------------+--------------+---------+
      
    9. start job and check status

       sqoop:000> start job -j 1 
       Submission details
       Job ID: 1
       Server URL: http://node2:12000/sqoop/
       Created by: hduser
       Creation date: 2015-07-09 17:26:59 CST
       Lastly updated by: hduser
       External ID: job_1436433861773_0001
           http://node5:8088/proxy/application_1436433861773_0001/
       2015-07-09 17:26:59 CST: BOOTING  - Progress is not available
      
       sqoop:000> status job -j 1
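       
        # once the job finishes, the rows land in the output directory
        # chosen above (run from a regular shell, not sqoop2-shell)
        $ /opt/bigdata/hadoop/bin/hdfs dfs -ls /user/hduser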
      

8. setup spark

  1. topology

                               node5
                          +--------------+
                          |              |
                          | spark master |
                          |              |
                          +--------------+
                         /       |        \
                        /        |         \
                       /         |          \
                      /          |           \
                     /           |            \
                    /            |             \
               node2            node3           node4
         +--------------+ +--------------+ +--------------+
         |              | |              | |              |
         | spark worker | | spark worker | | spark worker |
         |              | |              | |              |
         +--------------+ +--------------+ +--------------+
    
  2. setup

    1. copy software to node5

        $ scp spark-1.3.1-bin-hadoop2.6.tgz hduser@node5:/opt/bigdata
       
        # on node5
        $ cd /opt/bigdata
        $ tar -zxf spark-1.3.1-bin-hadoop2.6.tgz
        $ mv spark-1.3.1-bin-hadoop2.6 spark
      
       $ scp -r /opt/bigdata/spark hduser@node4:/opt/bigdata/spark
       $ scp -r /opt/bigdata/spark hduser@node3:/opt/bigdata/spark
       $ scp -r /opt/bigdata/spark hduser@node2:/opt/bigdata/spark
      
    2. start spark master on node5

       $ /opt/bigdata/spark/sbin/start-master.sh
      
    3. start spark worker on node4 node3 node2

       # on node4
       $ /opt/bigdata/spark/bin/spark-class org.apache.spark.deploy.worker.Worker spark://node5:7077
      
       # on node3
       $ /opt/bigdata/spark/bin/spark-class org.apache.spark.deploy.worker.Worker spark://node5:7077
      
       # on node2
       $ /opt/bigdata/spark/bin/spark-class org.apache.spark.deploy.worker.Worker spark://node5:7077
      
    4. demo

       $ /opt/bigdata/spark/bin/spark-submit \
       --class org.apache.spark.examples.SparkPi \
       --master yarn-cluster \
       --num-executors 3 \
       --driver-memory 4g \
       --executor-memory 2g \
       --executor-cores 1 \
        --queue default \
       /opt/bigdata/spark/lib/spark-examples*.jar \
       10
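       
        # the demo above submits to YARN; to run on the standalone master
        # started in steps 2-3 instead, a minimal sketch (executor sizing
        # illustrative):
        $ /opt/bigdata/spark/bin/spark-submit \
        --class org.apache.spark.examples.SparkPi \
        --master spark://node5:7077 \
        --executor-memory 2g \
        /opt/bigdata/spark/lib/spark-examples*.jar \
        10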
      

9. fixed

  1. problem connecting to server

    1. log on node3

       2015-07-08 08:20:39,044 WARN org.apache.hadoop.hdfs.server.datanode.DataNode: Problem connecting to server: node5/192.168.120.155:9000
      
       2015-07-08 08:20:48,046 INFO org.apache.hadoop.ipc.Client: Retrying connect to server: node5/192.168.120.155:9000. Already tried 3 time(s); retry policy is RetryUpToMaximumCountWithFixedSleep(maxRetries=10, sleepTime=1000 MILLISECONDS)
      
    2. datanode can’t talk to namenode

    3. hdfs service failed to start, issues with datanode and namenode

       Harsh J
      
        Sorry, should've sent you a direct string… I mean something like this: 
       
        127.0.0.1 localhost johniv-able 
        # 127.0.1.1 johniv-able 
       
        # i.e. comment out the 127.0.1.1 <hostname> line in /etc/hosts on
        # each node so the namenode binds to its real address, not loopback
      
  2. ERROR 2003 (HY000): Can’t connect to MySQL server on ‘node3’ (61)

    1. ubuntuforums

    2. stackoverflow error-2003

    3. stackoverflow error-61

       # 1. check status
       $ netstat -tulpen
       (Not all processes could be identified, non-owned process info
        will not be shown, you would have to be root to see it all.)
       Active Internet connections (only servers)
       Proto Recv-Q Send-Q Local Address           Foreign Address         State       User       Inode       PID/Program name
       tcp        0      0 0.0.0.0:22              0.0.0.0:*               LISTEN      0          20360       -               
       tcp        0      0 0.0.0.0:3306            0.0.0.0:*               LISTEN      121        31073       -               
       tcp        0      0 0.0.0.0:50010           0.0.0.0:*               LISTEN      1001       29143       4193/java       
       tcp        0      0 0.0.0.0:50020           0.0.0.0:*               LISTEN      1001       33985       4193/java       
       tcp        0      0 0.0.0.0:50075           0.0.0.0:*               LISTEN      1001       29188       4193/java       
       tcp        0      0 127.0.0.1:48933         0.0.0.0:*               LISTEN      1001       29149       4193/java       
       tcp        0      0 127.0.0.1:5939          0.0.0.0:*               LISTEN      0          18701       -               
       tcp        0      0 127.0.0.1:631           0.0.0.0:*               LISTEN      0          120296      -               
       tcp6       0      0 192.168.120.153:16020   :::*                    LISTEN      1001       33404       4416/java       
       tcp6       0      0 ::1:631                 :::*                    LISTEN      0          120295      -               
       tcp6       0      0 :::13562                :::*                    LISTEN      1001       33608       4840/java       
       tcp6       0      0 :::16030                :::*                    LISTEN      1001       33425       4416/java       
       tcp6       0      0 :::22                   :::*                    LISTEN      0          20362       -               
       tcp6       0      0 :::52600                :::*                    LISTEN      1001       41017       4840/java       
       tcp6       0      0 :::8040                 :::*                    LISTEN      1001       33604       4840/java       
       tcp6       0      0 :::8042                 :::*                    LISTEN      1001       33609       4840/java       
       udp        0      0 0.0.0.0:41761           0.0.0.0:*                           107        16926       -               
       udp        0      0 0.0.0.0:5353            0.0.0.0:*                           107        16924       -               
       udp        0      0 0.0.0.0:631             0.0.0.0:*                           0          18693       -               
       udp6       0      0 :::52756                :::*                                107        16927       -               
       udp6       0      0 :::5353                 :::*                                107        16925       -               
      
        # 2. then edit `my.cnf` so mysql binds to 0.0.0.0 (next step)
      
    4. edit my.cnf

       $ sudo pico /etc/alternatives/my.cnf
       or
       $ sudo pico /etc/mysql/my.cnf
       # You can copy this to one of:
       # - "/etc/mysql/my.cnf" to set global options,
       # - "~/.my.cnf" to set user-specific options.
      
       [mysqld]
       bind-address = 0.0.0.0
      
    5. grant privileges

       mysql> grant all privileges on *.* to 'sa'@'%' identified by 'sa';
      
    6. restart mysql service

       $ /etc/init.d/mysql restart
       or
       $ service mysql restart
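       
        # verify remote access, e.g. from node2 (assumes the mysql
        # client is installed there)
        $ mysql -h node3 -u sa -p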
      
  3. Connection refused

    1. java.net.ConnectException

       sqoop:000> start job -j 1
       2015-07-09 16:30:32 CST: FAILURE_ON_SUBMIT 
       Exception: java.net.ConnectException: Call From node2/192.168.120.152 to node5:8040 failed on connection exception: java.net.ConnectException: Connection refused; For more details see:  http://wiki.apache.org/hadoop/ConnectionRefused
      
    2. ConnectionRefused

    3. fixed

       # yarn not started
       # on node5
       $ cd /opt/bigdata/hadoop/etc/hadoop
       $ grep 8040 ./*
       ./yarn-site.xml:        <value>node5:8040</value>
      
       # so start yarn
       $ /opt/bigdata/hadoop/sbin/start-yarn.sh
      
  4. Class not found

    1. Error message: Class not found

       sqoop:000> create link -c 1
       Creating link for connector with id 1
       Please fill following values to create new link object
       Name: mssql
      
       Link configuration
      
       JDBC Driver Class: com.microsoft.jdbc.sqlserver 
       JDBC Connection String: jdbc:sqlserver://192.168.120.151
       Username: sa
       Password: **
       JDBC Connection Properties: 
       There are currently 0 values in the map:
       entry# protocol=tcp
       There are currently 1 values in the map:
       protocol = tcp
       entry# 
      
        There are issues with entered data, please revise your input:
       Name: mssql       
      
       Link configuration
      
       Error message: Class not found 
       JDBC Driver Class: com.microsoft.jdbc.sqlserver.SQLServerDriver
      
    2. fixed

       # 1. find  `sqljdbc4.jar`
       # 2. unzip `sqljdbc4.jar`
       # 3. check path
       # so sqlserver's driver class is
       com.microsoft.sqlserver.jdbc.SQLServerDriver
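       
        # e.g. confirm by listing the jar's contents:
        $ unzip -l sqljdbc4.jar | grep SQLServerDriver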
      

