Categories: Uncategorized

[AmazonEC2][Hadoop]Complete Distribution Mode



Instance : m1.large
Number of instances : 3

Standalone Mode

$sudo apt-get update
$sudo apt-get install default-jdk
$sudo adduser hadoop
$sudo su - hadoop

[hadoop user]
$tar zxvf hadoop-1.1.2.tar.gz

[default user]
$sudo mv /home/hadoop/hadoop-1.1.2 /usr/local/hadoop
$sudo chown hadoop:hadoop -R /usr/local/hadoop
$sudo su - hadoop

[hadoop user]
$vim .bashrc
export JAVA_HOME=/usr
export HADOOP_INSTALL=/usr/local/hadoop
export PATH=$PATH:$HADOOP_INSTALL/bin
$source .bashrc
$hadoop version
Hadoop 1.1.2
Subversion -r 1440782
Compiled by hortonfo on Thu Jan 31 02:03:24 UTC 2013
From source with checksum c720ddcf4b926991de7467d253a79b8b
$mkdir hadoop-job
$cd hadoop-job
$mkdir input
$vim input/a
a b c
$vim input/b
a a b c c c
$hadoop jar /usr/local/hadoop/hadoop-examples-1.1.2.jar wordcount input output
13/05/27 15:16:16 INFO util.NativeCodeLoader: Loaded the native-hadoop library
13/05/27 15:16:16 INFO input.FileInputFormat: Total input paths to process : 1
13/05/27 15:16:16 WARN snappy.LoadSnappy: Snappy native library not loaded
13/05/27 15:16:16 INFO mapred.JobClient: Running job: job_local_0001
13/05/27 15:16:16 INFO util.ProcessTree: setsid exited with exit code 0
13/05/27 15:16:16 INFO mapred.Task:  Using ResourceCalculatorPlugin : org.apache.hadoop.util.LinuxResourceCalculatorPlugin@3d3cdaa
13/05/27 15:16:16 INFO mapred.MapTask: io.sort.mb = 100
13/05/27 15:16:16 INFO mapred.MapTask: data buffer = 79691776/99614720
13/05/27 15:16:16 INFO mapred.MapTask: record buffer = 262144/327680
13/05/27 15:16:16 INFO mapred.MapTask: Starting flush of map output
13/05/27 15:16:17 INFO mapred.MapTask: Finished spill 0
13/05/27 15:16:17 INFO mapred.Task: Task:attempt_local_0001_m_000000_0 is done. And is in the process of commiting
13/05/27 15:16:17 INFO mapred.LocalJobRunner:
13/05/27 15:16:17 INFO mapred.Task: Task 'attempt_local_0001_m_000000_0' done.
13/05/27 15:16:17 INFO mapred.Task:  Using ResourceCalculatorPlugin : org.apache.hadoop.util.LinuxResourceCalculatorPlugin@670fe2b9
13/05/27 15:16:17 INFO mapred.LocalJobRunner:
13/05/27 15:16:17 INFO mapred.Merger: Merging 1 sorted segments
13/05/27 15:16:17 INFO mapred.Merger: Down to the last merge-pass, with 1 segments left of total size: 26 bytes
13/05/27 15:16:17 INFO mapred.LocalJobRunner:
13/05/27 15:16:17 INFO mapred.Task: Task:attempt_local_0001_r_000000_0 is done. And is in the process of commiting
13/05/27 15:16:17 INFO mapred.LocalJobRunner:
13/05/27 15:16:17 INFO mapred.Task: Task attempt_local_0001_r_000000_0 is allowed to commit now
13/05/27 15:16:17 INFO output.FileOutputCommitter: Saved output of task 'attempt_local_0001_r_000000_0' to output
13/05/27 15:16:17 INFO mapred.LocalJobRunner: reduce > reduce
13/05/27 15:16:17 INFO mapred.Task: Task 'attempt_local_0001_r_000000_0' done.
13/05/27 15:16:17 INFO mapred.JobClient:  map 100% reduce 100%
13/05/27 15:16:17 INFO mapred.JobClient: Job complete: job_local_0001
13/05/27 15:16:17 INFO mapred.JobClient: Counters: 20
13/05/27 15:16:17 INFO mapred.JobClient:   File Output Format Counters
13/05/27 15:16:17 INFO mapred.JobClient:     Bytes Written=24
13/05/27 15:16:17 INFO mapred.JobClient:   FileSystemCounters
13/05/27 15:16:17 INFO mapred.JobClient:     FILE_BYTES_READ=285258
13/05/27 15:16:17 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=384918
13/05/27 15:16:17 INFO mapred.JobClient:   File Input Format Counters
13/05/27 15:16:17 INFO mapred.JobClient:     Bytes Read=6
13/05/27 15:16:17 INFO mapred.JobClient:   Map-Reduce Framework
13/05/27 15:16:17 INFO mapred.JobClient:     Map output materialized bytes=30
13/05/27 15:16:17 INFO mapred.JobClient:     Map input records=1
13/05/27 15:16:17 INFO mapred.JobClient:     Reduce shuffle bytes=0
13/05/27 15:16:17 INFO mapred.JobClient:     Spilled Records=6
13/05/27 15:16:17 INFO mapred.JobClient:     Map output bytes=18
13/05/27 15:16:17 INFO mapred.JobClient:     Total committed heap usage (bytes)=433455104
13/05/27 15:16:17 INFO mapred.JobClient:     CPU time spent (ms)=0
13/05/27 15:16:17 INFO mapred.JobClient:     SPLIT_RAW_BYTES=101
13/05/27 15:16:17 INFO mapred.JobClient:     Combine input records=3
13/05/27 15:16:17 INFO mapred.JobClient:     Reduce input records=3
13/05/27 15:16:17 INFO mapred.JobClient:     Reduce input groups=3
13/05/27 15:16:17 INFO mapred.JobClient:     Combine output records=3
13/05/27 15:16:17 INFO mapred.JobClient:     Physical memory (bytes) snapshot=0
13/05/27 15:16:17 INFO mapred.JobClient:     Reduce output records=3
13/05/27 15:16:17 INFO mapred.JobClient:     Virtual memory (bytes) snapshot=0
13/05/27 15:16:17 INFO mapred.JobClient:     Map output records=3
input output
$ls output/
part-r-00000  _SUCCESS
$less output/part-r-00000
a       1
b       1
c       1

Pseudo-Distributed Mode

[hadoop user]
$ssh-keygen -t dsa
(Press Enter at the prompts to leave the passphrase empty.)
$cat ~/.ssh/id_dsa.pub >> ~/.ssh/authorized_keys
$chmod 600 ~/.ssh/authorized_keys
$ssh localhost

$vim /usr/local/hadoop/conf/core-site.xml
$vim /usr/local/hadoop/conf/mapred-site.xml
$vim /usr/local/hadoop/conf/hdfs-site.xml

$vim /usr/local/hadoop/conf/hadoop-env.sh
# The java implementation to use.  Required.
# export JAVA_HOME=/usr/lib/j2sdk1.5-sun
export JAVA_HOME=/usr

# Extra Java CLASSPATH elements.  Optional.

# The maximum amount of heap to use, in MB. Default is 1000.
# export HADOOP_HEAPSIZE=2000

$hadoop namenode -format
13/05/27 15:45:54 INFO namenode.NameNode: STARTUP_MSG:
STARTUP_MSG: Starting NameNode
STARTUP_MSG:   host = ip-10-254-0-175/
STARTUP_MSG:   args = [-format]
STARTUP_MSG:   version = 1.1.2
STARTUP_MSG:   build = -r 1440782; compiled by 'hortonfo' on Thu Jan 31 02:03:24 UTC 2013
Re-format filesystem in /tmp/hadoop-hadoop/dfs/name ? (Y or N) Y
13/05/27 15:45:57 INFO util.GSet: VM type       = 64-bit
13/05/27 15:45:57 INFO util.GSet: 2% max memory = 17.77875 MB
13/05/27 15:45:57 INFO util.GSet: capacity      = 2^21 = 2097152 entries
13/05/27 15:45:57 INFO util.GSet: recommended=2097152, actual=2097152
13/05/27 15:45:57 INFO namenode.FSNamesystem: fsOwner=hadoop
13/05/27 15:45:57 INFO namenode.FSNamesystem: supergroup=supergroup
13/05/27 15:45:57 INFO namenode.FSNamesystem: isPermissionEnabled=true
13/05/27 15:45:57 INFO namenode.FSNamesystem: dfs.block.invalidate.limit=100
13/05/27 15:45:57 INFO namenode.FSNamesystem: isAccessTokenEnabled=false accessKeyUpdateInterval=0 min(s), accessTokenLifetime=0 min(s)
13/05/27 15:45:57 INFO namenode.NameNode: Caching file names occuring more than 10 times
13/05/27 15:45:57 INFO common.Storage: Image file of size 112 saved in 0 seconds.
13/05/27 15:45:58 INFO namenode.FSEditLog: closing edit log: position=4, editlog=/tmp/hadoop-hadoop/dfs/name/current/edits
13/05/27 15:45:58 INFO namenode.FSEditLog: close success: truncate to 4, editlog=/tmp/hadoop-hadoop/dfs/name/current/edits
13/05/27 15:45:58 INFO common.Storage: Storage directory /tmp/hadoop-hadoop/dfs/name has been successfully formatted.
13/05/27 15:45:58 INFO namenode.NameNode: SHUTDOWN_MSG:
SHUTDOWN_MSG: Shutting down NameNode at ip-10-254-0-175/

$start-all.sh
namenode running as process 27482. Stop it first.
localhost: starting datanode, logging to /usr/local/hadoop/libexec/../logs/hadoop-hadoop-datanode-ip-10-254-0-175.out
localhost: secondarynamenode running as process 27907. Stop it first.
jobtracker running as process 18517. Stop it first.
localhost: starting tasktracker, logging to /usr/local/hadoop/libexec/../logs/hadoop-hadoop-tasktracker-ip-10-254-0-175.out

$jps
28237 NodeManager
18517 JobTracker
28057 ResourceManager
20081 Jps
20004 TaskTracker
27482 NameNode
27907 SecondaryNameNode
$hadoop fs -ls /
Found 1 items
drwxr-xr-x   - hadoop supergroup          0 2013-05-27 16:01 /tmp

$cd hadoop-job
$hadoop fs -put input input
$hadoop fs -ls input
Found 1 items
-rw-r--r--   1 hadoop supergroup          6 2013-05-27 16:02 /user/hadoop/input/a
$hadoop jar /usr/local/hadoop/hadoop-examples-1.1.2.jar wordcount input output
13/05/27 16:03:40 INFO input.FileInputFormat: Total input paths to process : 1
13/05/27 16:03:40 INFO util.NativeCodeLoader: Loaded the native-hadoop library
13/05/27 16:03:40 WARN snappy.LoadSnappy: Snappy native library not loaded
13/05/27 16:03:41 INFO mapred.JobClient: Running job: job_201305271601_0001
13/05/27 16:03:42 INFO mapred.JobClient:  map 0% reduce 0%
13/05/27 16:03:48 INFO mapred.JobClient:  map 100% reduce 0%
13/05/27 16:03:56 INFO mapred.JobClient:  map 100% reduce 33%
13/05/27 16:03:57 INFO mapred.JobClient:  map 100% reduce 100%
13/05/27 16:03:58 INFO mapred.JobClient: Job complete: job_201305271601_0001
13/05/27 16:03:58 INFO mapred.JobClient: Counters: 29
13/05/27 16:03:58 INFO mapred.JobClient:   Job Counters
13/05/27 16:03:58 INFO mapred.JobClient:     Launched reduce tasks=1
13/05/27 16:03:58 INFO mapred.JobClient:     SLOTS_MILLIS_MAPS=5885
13/05/27 16:03:58 INFO mapred.JobClient:     Total time spent by all reduces waiting after reserving slots (ms)=0
13/05/27 16:03:58 INFO mapred.JobClient:     Total time spent by all maps waiting after reserving slots (ms)=0
13/05/27 16:03:58 INFO mapred.JobClient:     Launched map tasks=1
13/05/27 16:03:58 INFO mapred.JobClient:     Data-local map tasks=1
13/05/27 16:03:58 INFO mapred.JobClient:     SLOTS_MILLIS_REDUCES=8987
13/05/27 16:03:58 INFO mapred.JobClient:   File Output Format Counters
13/05/27 16:03:58 INFO mapred.JobClient:     Bytes Written=12
13/05/27 16:03:58 INFO mapred.JobClient:   FileSystemCounters
13/05/27 16:03:58 INFO mapred.JobClient:     FILE_BYTES_READ=30
13/05/27 16:03:58 INFO mapred.JobClient:     HDFS_BYTES_READ=112
13/05/27 16:03:58 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=103913
13/05/27 16:03:58 INFO mapred.JobClient:     HDFS_BYTES_WRITTEN=12
13/05/27 16:03:58 INFO mapred.JobClient:   File Input Format Counters
13/05/27 16:03:58 INFO mapred.JobClient:     Bytes Read=6
13/05/27 16:03:58 INFO mapred.JobClient:   Map-Reduce Framework
13/05/27 16:03:58 INFO mapred.JobClient:     Map output materialized bytes=30
13/05/27 16:03:58 INFO mapred.JobClient:     Map input records=1
13/05/27 16:03:58 INFO mapred.JobClient:     Reduce shuffle bytes=30
13/05/27 16:03:58 INFO mapred.JobClient:     Spilled Records=6
13/05/27 16:03:58 INFO mapred.JobClient:     Map output bytes=18
13/05/27 16:03:58 INFO mapred.JobClient:     CPU time spent (ms)=830
13/05/27 16:03:58 INFO mapred.JobClient:     Total committed heap usage (bytes)=292552704
13/05/27 16:03:58 INFO mapred.JobClient:     Combine input records=3
13/05/27 16:03:58 INFO mapred.JobClient:     SPLIT_RAW_BYTES=106
13/05/27 16:03:58 INFO mapred.JobClient:     Reduce input records=3
13/05/27 16:03:58 INFO mapred.JobClient:     Reduce input groups=3
13/05/27 16:03:58 INFO mapred.JobClient:     Combine output records=3
13/05/27 16:03:58 INFO mapred.JobClient:     Physical memory (bytes) snapshot=290643968
13/05/27 16:03:58 INFO mapred.JobClient:     Reduce output records=3
13/05/27 16:03:58 INFO mapred.JobClient:     Virtual memory (bytes) snapshot=2163224576
13/05/27 16:03:58 INFO mapred.JobClient:     Map output records=3

$hadoop fs -ls output
Found 3 items
-rw-r--r--   1 hadoop supergroup          0 2013-05-27 16:03 /user/hadoop/output/_SUCCESS
drwxr-xr-x   - hadoop supergroup          0 2013-05-27 16:03 /user/hadoop/output/_logs
-rw-r--r--   1 hadoop supergroup         12 2013-05-27 16:03 /user/hadoop/output/part-r-00000
$hadoop fs -cat output/part-r-00000
a       3
b       2
c       4

Fully Distributed Mode

Key Exchange

[default user@hadoop1]
$sudo cp /home/hadoop/.ssh/id_dsa.pub .
$sudo chmod 644 id_dsa.pub

Copy the public key file to your local machine with an SCP client.
Copy the public key file to the hadoop2 server with an SCP client.
Copy the public key file to the hadoop3 server with an SCP client.

[default user@hadoop2]
$sudo adduser hadoop
$sudo su - hadoop
$mkdir .ssh
[default user@hadoop2]
$sudo mv id_dsa.pub /home/hadoop/.ssh/
$sudo su - hadoop

$cat ~/.ssh/id_dsa.pub >> ~/.ssh/authorized_keys
$chmod 600 ~/.ssh/authorized_keys

$ssh -i ~/.ssh/id_dsa <instance-name>
$ssh <instance-name>

[default user@hadoop3]
$sudo adduser hadoop
$sudo su - hadoop
$mkdir .ssh
[default user@hadoop3]
$sudo mv id_dsa.pub /home/hadoop/.ssh/
$sudo su - hadoop

$cat ~/.ssh/id_dsa.pub >> ~/.ssh/authorized_keys
$chmod 600 ~/.ssh/authorized_keys

$ssh -i ~/.ssh/id_dsa <instance-name>
$ssh <instance-name>

Repeat the same steps for hadoop2 and hadoop3.


Install Hadoop on hadoop2 and hadoop3.

$vim /usr/local/hadoop/conf/masters
<hadoop1 host name>
$vim /usr/local/hadoop/conf/slaves
<hadoop1 host name>
<hadoop2 host name>
<hadoop3 host name>
$vim /usr/local/hadoop/conf/core-site.xml
    <value>hdfs://<hadoop1 host name>:9010</value>
$vim /usr/local/hadoop/conf/mapred-site.xml
    <value><hadoop1 host name>:9011</value>
$vim /usr/local/hadoop/conf/hdfs-site.xml

Apply the same settings on hadoop2 and hadoop3.

Run the job

$hadoop namenode -format
$start-all.sh
namenode running as process 1172. Stop it first.
datanode running as process 1368. Stop it first.
starting datanode, logging to /usr/local/hadoop/libexec/../logs/hadoop-hadoop-datanode-ip-10-254-71-114.out
starting datanode, logging to /usr/local/hadoop/libexec/../logs/hadoop-hadoop-datanode-ip-10-253-59-34.out
secondarynamenode running as process 1577. Stop it first.
jobtracker running as process 1674. Stop it first.
tasktracker running as process 1878. Stop it first.
starting tasktracker, logging to /usr/local/hadoop/libexec/../logs/hadoop-hadoop-tasktracker-ip-10-254-71-114.out
starting tasktracker, logging to /usr/local/hadoop/libexec/../logs/hadoop-hadoop-tasktracker-ip-10-253-59-34.out
$jps
1172 NameNode
1577 SecondaryNameNode
1368 DataNode
5848 Jps
1878 TaskTracker
1674 JobTracker

10732 TaskTracker
10576 DataNode
10785 Jps

10107 Jps
10054 TaskTracker
9898 DataNode

[default user@hadoop1]
$sudo vim /etc/hosts
<hadoop1 private ip>  <hadoop1 host name>
$sudo su - hadoop
$hadoop fs -ls output
$hadoop fs -rmr output
$cd hadoop-job
$hadoop fs -rmr input
$hadoop fs -put input input
$hadoop jar /usr/local/hadoop/hadoop-examples-1.1.2.jar wordcount input output
13/05/27 17:46:10 INFO input.FileInputFormat: Total input paths to process : 2
13/05/27 17:46:10 INFO util.NativeCodeLoader: Loaded the native-hadoop library
13/05/27 17:46:10 WARN snappy.LoadSnappy: Snappy native library not loaded
13/05/27 17:46:11 INFO mapred.JobClient: Running job: job_201305271601_0003
13/05/27 17:46:12 INFO mapred.JobClient:  map 0% reduce 0%
13/05/27 17:46:18 INFO mapred.JobClient:  map 100% reduce 0%
13/05/27 17:46:26 INFO mapred.JobClient:  map 100% reduce 33%
13/05/27 17:46:28 INFO mapred.JobClient:  map 100% reduce 100%
13/05/27 17:46:29 INFO mapred.JobClient: Job complete: job_201305271601_0003
13/05/27 17:46:29 INFO mapred.JobClient: Counters: 29
13/05/27 17:46:29 INFO mapred.JobClient:   Job Counters
13/05/27 17:46:29 INFO mapred.JobClient:     Launched reduce tasks=1
13/05/27 17:46:29 INFO mapred.JobClient:     SLOTS_MILLIS_MAPS=10234
13/05/27 17:46:29 INFO mapred.JobClient:     Total time spent by all reduces waiting after reserving slots (ms)=0
13/05/27 17:46:29 INFO mapred.JobClient:     Total time spent by all maps waiting after reserving slots (ms)=0
13/05/27 17:46:29 INFO mapred.JobClient:     Launched map tasks=2
13/05/27 17:46:29 INFO mapred.JobClient:     Data-local map tasks=2
13/05/27 17:46:29 INFO mapred.JobClient:     SLOTS_MILLIS_REDUCES=9374
13/05/27 17:46:29 INFO mapred.JobClient:   File Output Format Counters
13/05/27 17:46:29 INFO mapred.JobClient:     Bytes Written=12
13/05/27 17:46:29 INFO mapred.JobClient:   FileSystemCounters
13/05/27 17:46:29 INFO mapred.JobClient:     FILE_BYTES_READ=54
13/05/27 17:46:29 INFO mapred.JobClient:     HDFS_BYTES_READ=312
13/05/27 17:46:29 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=156442
13/05/27 17:46:29 INFO mapred.JobClient:     HDFS_BYTES_WRITTEN=12
13/05/27 17:46:29 INFO mapred.JobClient:   File Input Format Counters
13/05/27 17:46:29 INFO mapred.JobClient:     Bytes Read=18
13/05/27 17:46:29 INFO mapred.JobClient:   Map-Reduce Framework
13/05/27 17:46:29 INFO mapred.JobClient:     Map output materialized bytes=60
13/05/27 17:46:29 INFO mapred.JobClient:     Map input records=2
13/05/27 17:46:29 INFO mapred.JobClient:     Reduce shuffle bytes=60
13/05/27 17:46:29 INFO mapred.JobClient:     Spilled Records=12
13/05/27 17:46:29 INFO mapred.JobClient:     Map output bytes=54
13/05/27 17:46:29 INFO mapred.JobClient:     CPU time spent (ms)=1310
13/05/27 17:46:29 INFO mapred.JobClient:     Total committed heap usage (bytes)=468058112
13/05/27 17:46:29 INFO mapred.JobClient:     Combine input records=9
13/05/27 17:46:29 INFO mapred.JobClient:     SPLIT_RAW_BYTES=294
13/05/27 17:46:29 INFO mapred.JobClient:     Reduce input records=6
13/05/27 17:46:29 INFO mapred.JobClient:     Reduce input groups=3
13/05/27 17:46:29 INFO mapred.JobClient:     Combine output records=6
13/05/27 17:46:29 INFO mapred.JobClient:     Physical memory (bytes) snapshot=490381312
13/05/27 17:46:29 INFO mapred.JobClient:     Reduce output records=3
13/05/27 17:46:29 INFO mapred.JobClient:     Virtual memory (bytes) snapshot=3258781696
13/05/27 17:46:29 INFO mapred.JobClient:     Map output records=9
$hadoop fs -cat output/part-r-00000
a       3
b       2
c       4