Info
Instance : m1.large
Number of instances : 3
Standalone Mode
[sudoer]
$sudo apt-get update
$sudo apt-get install default-jdk
$sudo adduser hadoop
$sudo su - hadoop
[hadoop user]
$wget http://ftp.tsukuba.wide.ad.jp/software/apache/hadoop/common/stable/hadoop-1.1.2.tar.gz
$tar zxvf hadoop-1.1.2.tar.gz
$exit
[sudoer]
$sudo mv /home/hadoop/hadoop-1.1.2 /usr/local/hadoop
$sudo chown hadoop:hadoop -R /usr/local/hadoop
$sudo su - hadoop
[hadoop user]
$vim .bashrc
export JAVA_HOME=/usr
export HADOOP_INSTALL=/usr/local/hadoop
export PATH=$HADOOP_INSTALL/bin:$JAVA_HOME/bin:$PATH
$source .bashrc
$hadoop version
Hadoop 1.1.2
Subversion https://svn.apache.org/repos/asf/hadoop/common/branches/branch-1.1 -r 1440782
Compiled by hortonfo on Thu Jan 31 02:03:24 UTC 2013
From source with checksum c720ddcf4b926991de7467d253a79b8b
$mkdir hadoop-job
$cd hadoop-job
$mkdir input
$vim input/a
a b c
$vim input/b
a a b c c c
$hadoop jar /usr/local/hadoop/hadoop-examples-1.1.2.jar wordcount input output
13/05/27 15:16:16 INFO util.NativeCodeLoader: Loaded the native-hadoop library
13/05/27 15:16:16 INFO input.FileInputFormat: Total input paths to process : 1
13/05/27 15:16:16 WARN snappy.LoadSnappy: Snappy native library not loaded
13/05/27 15:16:16 INFO mapred.JobClient: Running job: job_local_0001
13/05/27 15:16:16 INFO util.ProcessTree: setsid exited with exit code 0
13/05/27 15:16:16 INFO mapred.Task: Using ResourceCalculatorPlugin : org.apache.hadoop.util.LinuxResourceCalculatorPlugin@3d3cdaa
13/05/27 15:16:16 INFO mapred.MapTask: io.sort.mb = 100
13/05/27 15:16:16 INFO mapred.MapTask: data buffer = 79691776/99614720
13/05/27 15:16:16 INFO mapred.MapTask: record buffer = 262144/327680
13/05/27 15:16:16 INFO mapred.MapTask: Starting flush of map output
13/05/27 15:16:17 INFO mapred.MapTask: Finished spill 0
13/05/27 15:16:17 INFO mapred.Task: Task:attempt_local_0001_m_000000_0 is done. And is in the process of commiting
13/05/27 15:16:17 INFO mapred.LocalJobRunner:
13/05/27 15:16:17 INFO mapred.Task: Task 'attempt_local_0001_m_000000_0' done.
13/05/27 15:16:17 INFO mapred.Task: Using ResourceCalculatorPlugin : org.apache.hadoop.util.LinuxResourceCalculatorPlugin@670fe2b9
13/05/27 15:16:17 INFO mapred.LocalJobRunner:
13/05/27 15:16:17 INFO mapred.Merger: Merging 1 sorted segments
13/05/27 15:16:17 INFO mapred.Merger: Down to the last merge-pass, with 1 segments left of total size: 26 bytes
13/05/27 15:16:17 INFO mapred.LocalJobRunner:
13/05/27 15:16:17 INFO mapred.Task: Task:attempt_local_0001_r_000000_0 is done. And is in the process of commiting
13/05/27 15:16:17 INFO mapred.LocalJobRunner:
13/05/27 15:16:17 INFO mapred.Task: Task attempt_local_0001_r_000000_0 is allowed to commit now
13/05/27 15:16:17 INFO output.FileOutputCommitter: Saved output of task 'attempt_local_0001_r_000000_0' to output
13/05/27 15:16:17 INFO mapred.LocalJobRunner: reduce > reduce
13/05/27 15:16:17 INFO mapred.Task: Task 'attempt_local_0001_r_000000_0' done.
13/05/27 15:16:17 INFO mapred.JobClient: map 100% reduce 100%
13/05/27 15:16:17 INFO mapred.JobClient: Job complete: job_local_0001
13/05/27 15:16:17 INFO mapred.JobClient: Counters: 20
13/05/27 15:16:17 INFO mapred.JobClient: File Output Format Counters
13/05/27 15:16:17 INFO mapred.JobClient: Bytes Written=24
13/05/27 15:16:17 INFO mapred.JobClient: FileSystemCounters
13/05/27 15:16:17 INFO mapred.JobClient: FILE_BYTES_READ=285258
13/05/27 15:16:17 INFO mapred.JobClient: FILE_BYTES_WRITTEN=384918
13/05/27 15:16:17 INFO mapred.JobClient: File Input Format Counters
13/05/27 15:16:17 INFO mapred.JobClient: Bytes Read=6
13/05/27 15:16:17 INFO mapred.JobClient: Map-Reduce Framework
13/05/27 15:16:17 INFO mapred.JobClient: Map output materialized bytes=30
13/05/27 15:16:17 INFO mapred.JobClient: Map input records=1
13/05/27 15:16:17 INFO mapred.JobClient: Reduce shuffle bytes=0
13/05/27 15:16:17 INFO mapred.JobClient: Spilled Records=6
13/05/27 15:16:17 INFO mapred.JobClient: Map output bytes=18
13/05/27 15:16:17 INFO mapred.JobClient: Total committed heap usage (bytes)=433455104
13/05/27 15:16:17 INFO mapred.JobClient: CPU time spent (ms)=0
13/05/27 15:16:17 INFO mapred.JobClient: SPLIT_RAW_BYTES=101
13/05/27 15:16:17 INFO mapred.JobClient: Combine input records=3
13/05/27 15:16:17 INFO mapred.JobClient: Reduce input records=3
13/05/27 15:16:17 INFO mapred.JobClient: Reduce input groups=3
13/05/27 15:16:17 INFO mapred.JobClient: Combine output records=3
13/05/27 15:16:17 INFO mapred.JobClient: Physical memory (bytes) snapshot=0
13/05/27 15:16:17 INFO mapred.JobClient: Reduce output records=3
13/05/27 15:16:17 INFO mapred.JobClient: Virtual memory (bytes) snapshot=0
13/05/27 15:16:17 INFO mapred.JobClient: Map output records=3
$ls
input output
$ls output/
part-r-00000 _SUCCESS
$less output/part-r-00000
a 1
b 1
c 1
Pseudo-Distributed Mode
[hadoop user]
$ssh-keygen -t dsa
(Press Enter at the passphrase prompts to use an empty passphrase)
$cat ~/.ssh/id_dsa.pub >> ~/.ssh/authorized_keys
$chmod 600 ~/.ssh/authorized_keys
$ssh localhost
$exit
$vim /usr/local/hadoop/conf/core-site.xml
<configuration>
<property>
<name>fs.default.name</name>
<value>hdfs://localhost:9000</value>
</property>
</configuration>
$vim /usr/local/hadoop/conf/mapred-site.xml
<configuration>
<property>
<name>mapred.job.tracker</name>
<value>localhost:9001</value>
</property>
</configuration>
$vim /usr/local/hadoop/conf/hdfs-site.xml
<configuration>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
</configuration>
$vim /usr/local/hadoop/conf/hadoop-env.sh
# The java implementation to use. Required.
# export JAVA_HOME=/usr/lib/j2sdk1.5-sun
export JAVA_HOME=/usr
# Extra Java CLASSPATH elements. Optional.
# export HADOOP_CLASSPATH=
# The maximum amount of heap to use, in MB. Default is 1000.
# export HADOOP_HEAPSIZE=2000
export HADOOP_HEAPSIZE=2046
$hadoop namenode -format
13/05/27 15:45:54 INFO namenode.NameNode: STARTUP_MSG:
/************************************************************
STARTUP_MSG: Starting NameNode
STARTUP_MSG: host = ip-10-254-0-175/10.254.0.175
STARTUP_MSG: args = [-format]
STARTUP_MSG: version = 1.1.2
STARTUP_MSG: build = https://svn.apache.org/repos/asf/hadoop/common/branches/branch-1.1 -r 1440782; compiled by 'hortonfo' on Thu Jan 31 02:03:24 UTC 2013
************************************************************/
Re-format filesystem in /tmp/hadoop-hadoop/dfs/name ? (Y or N) Y
13/05/27 15:45:57 INFO util.GSet: VM type = 64-bit
13/05/27 15:45:57 INFO util.GSet: 2% max memory = 17.77875 MB
13/05/27 15:45:57 INFO util.GSet: capacity = 2^21 = 2097152 entries
13/05/27 15:45:57 INFO util.GSet: recommended=2097152, actual=2097152
13/05/27 15:45:57 INFO namenode.FSNamesystem: fsOwner=hadoop
13/05/27 15:45:57 INFO namenode.FSNamesystem: supergroup=supergroup
13/05/27 15:45:57 INFO namenode.FSNamesystem: isPermissionEnabled=true
13/05/27 15:45:57 INFO namenode.FSNamesystem: dfs.block.invalidate.limit=100
13/05/27 15:45:57 INFO namenode.FSNamesystem: isAccessTokenEnabled=false accessKeyUpdateInterval=0 min(s), accessTokenLifetime=0 min(s)
13/05/27 15:45:57 INFO namenode.NameNode: Caching file names occuring more than 10 times
13/05/27 15:45:57 INFO common.Storage: Image file of size 112 saved in 0 seconds.
13/05/27 15:45:58 INFO namenode.FSEditLog: closing edit log: position=4, editlog=/tmp/hadoop-hadoop/dfs/name/current/edits
13/05/27 15:45:58 INFO namenode.FSEditLog: close success: truncate to 4, editlog=/tmp/hadoop-hadoop/dfs/name/current/edits
13/05/27 15:45:58 INFO common.Storage: Storage directory /tmp/hadoop-hadoop/dfs/name has been successfully formatted.
13/05/27 15:45:58 INFO namenode.NameNode: SHUTDOWN_MSG:
/************************************************************
SHUTDOWN_MSG: Shutting down NameNode at ip-10-254-0-175/10.254.0.175
************************************************************/
$start-all.sh
namenode running as process 27482. Stop it first.
localhost: starting datanode, logging to /usr/local/hadoop/libexec/../logs/hadoop-hadoop-datanode-ip-10-254-0-175.out
localhost: secondarynamenode running as process 27907. Stop it first.
jobtracker running as process 18517. Stop it first.
localhost: starting tasktracker, logging to /usr/local/hadoop/libexec/../logs/hadoop-hadoop-tasktracker-ip-10-254-0-175.out
$jps
28237 NodeManager
18517 JobTracker
28057 ResourceManager
20081 Jps
20004 TaskTracker
27482 NameNode
27907 SecondaryNameNode
$hadoop fs -ls /
Found 1 items
drwxr-xr-x - hadoop supergroup 0 2013-05-27 16:01 /tmp
$cd hadoop-job
$hadoop fs -put input input
$hadoop fs -ls input
Found 1 items
-rw-r--r-- 1 hadoop supergroup 6 2013-05-27 16:02 /user/hadoop/input/a
$hadoop jar /usr/local/hadoop/hadoop-examples-1.1.2.jar wordcount input output
13/05/27 16:03:40 INFO input.FileInputFormat: Total input paths to process : 1
13/05/27 16:03:40 INFO util.NativeCodeLoader: Loaded the native-hadoop library
13/05/27 16:03:40 WARN snappy.LoadSnappy: Snappy native library not loaded
13/05/27 16:03:41 INFO mapred.JobClient: Running job: job_201305271601_0001
13/05/27 16:03:42 INFO mapred.JobClient: map 0% reduce 0%
13/05/27 16:03:48 INFO mapred.JobClient: map 100% reduce 0%
13/05/27 16:03:56 INFO mapred.JobClient: map 100% reduce 33%
13/05/27 16:03:57 INFO mapred.JobClient: map 100% reduce 100%
13/05/27 16:03:58 INFO mapred.JobClient: Job complete: job_201305271601_0001
13/05/27 16:03:58 INFO mapred.JobClient: Counters: 29
13/05/27 16:03:58 INFO mapred.JobClient: Job Counters
13/05/27 16:03:58 INFO mapred.JobClient: Launched reduce tasks=1
13/05/27 16:03:58 INFO mapred.JobClient: SLOTS_MILLIS_MAPS=5885
13/05/27 16:03:58 INFO mapred.JobClient: Total time spent by all reduces waiting after reserving slots (ms)=0
13/05/27 16:03:58 INFO mapred.JobClient: Total time spent by all maps waiting after reserving slots (ms)=0
13/05/27 16:03:58 INFO mapred.JobClient: Launched map tasks=1
13/05/27 16:03:58 INFO mapred.JobClient: Data-local map tasks=1
13/05/27 16:03:58 INFO mapred.JobClient: SLOTS_MILLIS_REDUCES=8987
13/05/27 16:03:58 INFO mapred.JobClient: File Output Format Counters
13/05/27 16:03:58 INFO mapred.JobClient: Bytes Written=12
13/05/27 16:03:58 INFO mapred.JobClient: FileSystemCounters
13/05/27 16:03:58 INFO mapred.JobClient: FILE_BYTES_READ=30
13/05/27 16:03:58 INFO mapred.JobClient: HDFS_BYTES_READ=112
13/05/27 16:03:58 INFO mapred.JobClient: FILE_BYTES_WRITTEN=103913
13/05/27 16:03:58 INFO mapred.JobClient: HDFS_BYTES_WRITTEN=12
13/05/27 16:03:58 INFO mapred.JobClient: File Input Format Counters
13/05/27 16:03:58 INFO mapred.JobClient: Bytes Read=6
13/05/27 16:03:58 INFO mapred.JobClient: Map-Reduce Framework
13/05/27 16:03:58 INFO mapred.JobClient: Map output materialized bytes=30
13/05/27 16:03:58 INFO mapred.JobClient: Map input records=1
13/05/27 16:03:58 INFO mapred.JobClient: Reduce shuffle bytes=30
13/05/27 16:03:58 INFO mapred.JobClient: Spilled Records=6
13/05/27 16:03:58 INFO mapred.JobClient: Map output bytes=18
13/05/27 16:03:58 INFO mapred.JobClient: CPU time spent (ms)=830
13/05/27 16:03:58 INFO mapred.JobClient: Total committed heap usage (bytes)=292552704
13/05/27 16:03:58 INFO mapred.JobClient: Combine input records=3
13/05/27 16:03:58 INFO mapred.JobClient: SPLIT_RAW_BYTES=106
13/05/27 16:03:58 INFO mapred.JobClient: Reduce input records=3
13/05/27 16:03:58 INFO mapred.JobClient: Reduce input groups=3
13/05/27 16:03:58 INFO mapred.JobClient: Combine output records=3
13/05/27 16:03:58 INFO mapred.JobClient: Physical memory (bytes) snapshot=290643968
13/05/27 16:03:58 INFO mapred.JobClient: Reduce output records=3
13/05/27 16:03:58 INFO mapred.JobClient: Virtual memory (bytes) snapshot=2163224576
13/05/27 16:03:58 INFO mapred.JobClient: Map output records=3
$hadoop fs -ls output
Found 3 items
-rw-r--r-- 1 hadoop supergroup 0 2013-05-27 16:03 /user/hadoop/output/_SUCCESS
drwxr-xr-x - hadoop supergroup 0 2013-05-27 16:03 /user/hadoop/output/_logs
-rw-r--r-- 1 hadoop supergroup 12 2013-05-27 16:03 /user/hadoop/output/part-r-00000
$hadoop fs -cat output/part-r-00000
a 3
b 2
c 4
Fully Distributed Mode
Key Exchange
[default user@hadoop1]
$sudo cp /home/hadoop/.ssh/id_dsa.pub .
$sudo chmod 644 id_dsa.pub
Copy the id_dsa.pub file to your local machine with an SCP client
Copy the id_dsa.pub file to the hadoop2 server with an SCP client
Copy the id_dsa.pub file to the hadoop3 server with an SCP client
[default user@hadoop2]
$sudo adduser hadoop
$sudo su - hadoop
[hadoop@hadoop2]
$mkdir .ssh
$exit
[default user@hadoop2]
$sudo mv id_dsa.pub /home/hadoop/.ssh/hadoop1.pub
$sudo su - hadoop
[hadoop@hadoop2]
$cat ~/.ssh/hadoop1.pub >> ~/.ssh/authorized_keys
$chmod 600 ~/.ssh/authorized_keys
[hadoop@hadoop1]
$ssh -i ~/.ssh/id_dsa <instance-name>
$ssh <instance-name>
[default user@hadoop3]
$sudo adduser hadoop
$sudo su - hadoop
[hadoop@hadoop3]
$mkdir .ssh
$exit
[default user@hadoop3]
$sudo mv id_dsa.pub /home/hadoop/.ssh/hadoop1.pub
$sudo su - hadoop
[hadoop@hadoop3]
$cat ~/.ssh/hadoop1.pub >> ~/.ssh/authorized_keys
$chmod 600 ~/.ssh/authorized_keys
[hadoop@hadoop1]
$ssh -i ~/.ssh/id_dsa <instance-name>
$ssh <instance-name>
Do the same thing for hadoop2 and hadoop3
Setting
Install Hadoop on hadoop2 and hadoop3 the same way as on hadoop1.
[hadoop@hadoop1]
$vim /usr/local/hadoop/conf/masters
<hadoop1 host name>
$vim /usr/local/hadoop/conf/slaves
<hadoop1 host name>
<hadoop2 host name>
<hadoop3 host name>
$vim /usr/local/hadoop/conf/core-site.xml
<configuration>
<property>
<name>fs.default.name</name>
<value>hdfs://<hadoop1 host name>:9010</value>
</property>
</configuration>
$vim /usr/local/hadoop/conf/mapred-site.xml
<configuration>
<property>
<name>mapred.job.tracker</name>
<value><hadoop1 host name>:9011</value>
</property>
</configuration>
$vim /usr/local/hadoop/conf/hdfs-site.xml
<configuration>
<property>
<name>dfs.replication</name>
<value>3</value>
</property>
</configuration>
Apply the same configuration to hadoop2 and hadoop3
Do job
[hadoop@hadoop1]
$hadoop namenode -format
$start-all.sh
namenode running as process 1172. Stop it first.
ec2-54-244-249-227.us-west-2.compute.amazonaws.com: datanode running as process 1368. Stop it first.
ec2-54-245-99-29.us-west-2.compute.amazonaws.com: starting datanode, logging to /usr/local/hadoop/libexec/../logs/hadoop-hadoop-datanode-ip-10-254-71-114.out
ec2-54-214-148-55.us-west-2.compute.amazonaws.com: starting datanode, logging to /usr/local/hadoop/libexec/../logs/hadoop-hadoop-datanode-ip-10-253-59-34.out
ec2-54-244-249-227.us-west-2.compute.amazonaws.com: secondarynamenode running as process 1577. Stop it first.
jobtracker running as process 1674. Stop it first.
ec2-54-244-249-227.us-west-2.compute.amazonaws.com: tasktracker running as process 1878. Stop it first.
ec2-54-245-99-29.us-west-2.compute.amazonaws.com: starting tasktracker, logging to /usr/local/hadoop/libexec/../logs/hadoop-hadoop-tasktracker-ip-10-254-71-114.out
ec2-54-214-148-55.us-west-2.compute.amazonaws.com: starting tasktracker, logging to /usr/local/hadoop/libexec/../logs/hadoop-hadoop-tasktracker-ip-10-253-59-34.out
$jps
1172 NameNode
1577 SecondaryNameNode
1368 DataNode
5848 Jps
1878 TaskTracker
1674 JobTracker
[hadoop@hadoop2]
$jps
10732 TaskTracker
10576 DataNode
10785 Jps
[hadoop@hadoop3]
$jps
10107 Jps
10054 TaskTracker
9898 DataNode
[hadoop@hadoop1]
$exit
[default user@hadoop1]
$sudo vim /etc/hosts
<private IP> <hadoop1 host name>
$sudo su - hadoop
[hadoop@hadoop1]
$hadoop fs -ls output
$hadoop fs -rmr output
$cd hadoop-job
$hadoop fs -rmr input
$hadoop fs -put input input
$hadoop jar /usr/local/hadoop/hadoop-examples-1.1.2.jar wordcount input output
13/05/27 17:46:10 INFO input.FileInputFormat: Total input paths to process : 2
13/05/27 17:46:10 INFO util.NativeCodeLoader: Loaded the native-hadoop library
13/05/27 17:46:10 WARN snappy.LoadSnappy: Snappy native library not loaded
13/05/27 17:46:11 INFO mapred.JobClient: Running job: job_201305271601_0003
13/05/27 17:46:12 INFO mapred.JobClient: map 0% reduce 0%
13/05/27 17:46:18 INFO mapred.JobClient: map 100% reduce 0%
13/05/27 17:46:26 INFO mapred.JobClient: map 100% reduce 33%
13/05/27 17:46:28 INFO mapred.JobClient: map 100% reduce 100%
13/05/27 17:46:29 INFO mapred.JobClient: Job complete: job_201305271601_0003
13/05/27 17:46:29 INFO mapred.JobClient: Counters: 29
13/05/27 17:46:29 INFO mapred.JobClient: Job Counters
13/05/27 17:46:29 INFO mapred.JobClient: Launched reduce tasks=1
13/05/27 17:46:29 INFO mapred.JobClient: SLOTS_MILLIS_MAPS=10234
13/05/27 17:46:29 INFO mapred.JobClient: Total time spent by all reduces waiting after reserving slots (ms)=0
13/05/27 17:46:29 INFO mapred.JobClient: Total time spent by all maps waiting after reserving slots (ms)=0
13/05/27 17:46:29 INFO mapred.JobClient: Launched map tasks=2
13/05/27 17:46:29 INFO mapred.JobClient: Data-local map tasks=2
13/05/27 17:46:29 INFO mapred.JobClient: SLOTS_MILLIS_REDUCES=9374
13/05/27 17:46:29 INFO mapred.JobClient: File Output Format Counters
13/05/27 17:46:29 INFO mapred.JobClient: Bytes Written=12
13/05/27 17:46:29 INFO mapred.JobClient: FileSystemCounters
13/05/27 17:46:29 INFO mapred.JobClient: FILE_BYTES_READ=54
13/05/27 17:46:29 INFO mapred.JobClient: HDFS_BYTES_READ=312
13/05/27 17:46:29 INFO mapred.JobClient: FILE_BYTES_WRITTEN=156442
13/05/27 17:46:29 INFO mapred.JobClient: HDFS_BYTES_WRITTEN=12
13/05/27 17:46:29 INFO mapred.JobClient: File Input Format Counters
13/05/27 17:46:29 INFO mapred.JobClient: Bytes Read=18
13/05/27 17:46:29 INFO mapred.JobClient: Map-Reduce Framework
13/05/27 17:46:29 INFO mapred.JobClient: Map output materialized bytes=60
13/05/27 17:46:29 INFO mapred.JobClient: Map input records=2
13/05/27 17:46:29 INFO mapred.JobClient: Reduce shuffle bytes=60
13/05/27 17:46:29 INFO mapred.JobClient: Spilled Records=12
13/05/27 17:46:29 INFO mapred.JobClient: Map output bytes=54
13/05/27 17:46:29 INFO mapred.JobClient: CPU time spent (ms)=1310
13/05/27 17:46:29 INFO mapred.JobClient: Total committed heap usage (bytes)=468058112
13/05/27 17:46:29 INFO mapred.JobClient: Combine input records=9
13/05/27 17:46:29 INFO mapred.JobClient: SPLIT_RAW_BYTES=294
13/05/27 17:46:29 INFO mapred.JobClient: Reduce input records=6
13/05/27 17:46:29 INFO mapred.JobClient: Reduce input groups=3
13/05/27 17:46:29 INFO mapred.JobClient: Combine output records=6
13/05/27 17:46:29 INFO mapred.JobClient: Physical memory (bytes) snapshot=490381312
13/05/27 17:46:29 INFO mapred.JobClient: Reduce output records=3
13/05/27 17:46:29 INFO mapred.JobClient: Virtual memory (bytes) snapshot=3258781696
13/05/27 17:46:29 INFO mapred.JobClient: Map output records=9
$hadoop fs -cat output/part-r-00000
a 3
b 2
c 4
Congrats!