[sudoer]
$sudo apt-get update
$sudo apt-get install default-jdk
$sudo adduser hadoop
$sudo su - hadoop

[hadoop user]
$wget http://ftp.tsukuba.wide.ad.jp/software/apache/hadoop/common/stable/hadoop-1.1.2.tar.gz
$tar zxvf hadoop-1.1.2.tar.gz
$exit

[sudoer]
$sudo mv /home/hadoop/hadoop-1.1.2 /usr/local/hadoop
$sudo chown hadoop:hadoop -R /usr/local/hadoop
$sudo su - hadoop

[hadoop user]
$vim .bashrc
export JAVA_HOME=/usr
export HADOOP_INSTALL=/usr/local/hadoop
export PATH=$HADOOP_INSTALL/bin:$JAVA_HOME/bin:$PATH
$source .bashrc
$hadoop version
Hadoop 1.1.2
Subversion https://svn.apache.org/repos/asf/hadoop/common/branches/branch-1.1 -r 1440782
Compiled by hortonfo on Thu Jan 31 02:03:24 UTC 2013
From source with checksum c720ddcf4b926991de7467d253a79b8b
$mkdir hadoop-job
$cd hadoop-job
$mkdir input
$vim input/a
a b c
$vim input/b
a a b c c c
$hadoop jar /usr/local/hadoop/hadoop-examples-1.1.2.jar wordcount input output
13/05/27 15:16:16 INFO util.NativeCodeLoader: Loaded the native-hadoop library
13/05/27 15:16:16 INFO input.FileInputFormat: Total input paths to process : 1
13/05/27 15:16:16 WARN snappy.LoadSnappy: Snappy native library not loaded
13/05/27 15:16:16 INFO mapred.JobClient: Running job: job_local_0001
13/05/27 15:16:16 INFO util.ProcessTree: setsid exited with exit code 0
13/05/27 15:16:16 INFO mapred.Task: Using ResourceCalculatorPlugin : org.apache.hadoop.util.LinuxResourceCalculatorPlugin@3d3cdaa
13/05/27 15:16:16 INFO mapred.MapTask: io.sort.mb = 100
13/05/27 15:16:16 INFO mapred.MapTask: data buffer = 79691776/99614720
13/05/27 15:16:16 INFO mapred.MapTask: record buffer = 262144/327680
13/05/27 15:16:16 INFO mapred.MapTask: Starting flush of map output
13/05/27 15:16:17 INFO mapred.MapTask: Finished spill 0
13/05/27 15:16:17 INFO mapred.Task: Task:attempt_local_0001_m_000000_0 is done. And is in the process of commiting
13/05/27 15:16:17 INFO mapred.LocalJobRunner:
13/05/27 15:16:17 INFO mapred.Task: Task 'attempt_local_0001_m_000000_0' done.
13/05/27 15:16:17 INFO mapred.Task: Using ResourceCalculatorPlugin : org.apache.hadoop.util.LinuxResourceCalculatorPlugin@670fe2b9
13/05/27 15:16:17 INFO mapred.LocalJobRunner:
13/05/27 15:16:17 INFO mapred.Merger: Merging 1 sorted segments
13/05/27 15:16:17 INFO mapred.Merger: Down to the last merge-pass, with 1 segments left of total size: 26 bytes
13/05/27 15:16:17 INFO mapred.LocalJobRunner:
13/05/27 15:16:17 INFO mapred.Task: Task:attempt_local_0001_r_000000_0 is done. And is in the process of commiting
13/05/27 15:16:17 INFO mapred.LocalJobRunner:
13/05/27 15:16:17 INFO mapred.Task: Task attempt_local_0001_r_000000_0 is allowed to commit now
13/05/27 15:16:17 INFO output.FileOutputCommitter: Saved output of task 'attempt_local_0001_r_000000_0' to output
13/05/27 15:16:17 INFO mapred.LocalJobRunner: reduce > reduce
13/05/27 15:16:17 INFO mapred.Task: Task 'attempt_local_0001_r_000000_0' done.
13/05/27 15:16:17 INFO mapred.JobClient:  map 100% reduce 100%
13/05/27 15:16:17 INFO mapred.JobClient: Job complete: job_local_0001
13/05/27 15:16:17 INFO mapred.JobClient: Counters: 20
13/05/27 15:16:17 INFO mapred.JobClient:   File Output Format Counters
13/05/27 15:16:17 INFO mapred.JobClient:     Bytes Written=24
13/05/27 15:16:17 INFO mapred.JobClient:   FileSystemCounters
13/05/27 15:16:17 INFO mapred.JobClient:     FILE_BYTES_READ=285258
13/05/27 15:16:17 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=384918
13/05/27 15:16:17 INFO mapred.JobClient:   File Input Format Counters
13/05/27 15:16:17 INFO mapred.JobClient:     Bytes Read=6
13/05/27 15:16:17 INFO mapred.JobClient:   Map-Reduce Framework
13/05/27 15:16:17 INFO mapred.JobClient:     Map output materialized bytes=30
13/05/27 15:16:17 INFO mapred.JobClient:     Map input records=1
13/05/27 15:16:17 INFO mapred.JobClient:     Reduce shuffle bytes=0
13/05/27 15:16:17 INFO mapred.JobClient:     Spilled Records=6
13/05/27 15:16:17 INFO mapred.JobClient:     Map output bytes=18
13/05/27 15:16:17 INFO mapred.JobClient:     Total committed heap usage (bytes)=433455104
13/05/27 15:16:17 INFO mapred.JobClient:     CPU time spent (ms)=0
13/05/27 15:16:17 INFO mapred.JobClient:     SPLIT_RAW_BYTES=101
13/05/27 15:16:17 INFO mapred.JobClient:     Combine input records=3
13/05/27 15:16:17 INFO mapred.JobClient:     Reduce input records=3
13/05/27 15:16:17 INFO mapred.JobClient:     Reduce input groups=3
13/05/27 15:16:17 INFO mapred.JobClient:     Combine output records=3
13/05/27 15:16:17 INFO mapred.JobClient:     Physical memory (bytes) snapshot=0
13/05/27 15:16:17 INFO mapred.JobClient:     Reduce output records=3
13/05/27 15:16:17 INFO mapred.JobClient:     Virtual memory (bytes) snapshot=0
13/05/27 15:16:17 INFO mapred.JobClient:     Map output records=3
$ls
input  output
$ls output/
part-r-00000  _SUCCESS
$less output/part-r-00000
a    1
b    1
c    1
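One gotcha worth noting before rerunning the job: wordcount refuses to start if the output directory already exists. In standalone mode the output lives on the local filesystem, so a clean rerun is just a local rm followed by the same jar invocation. A minimal sketch, reusing the paths from the session above:

#!/bin/bash
# Standalone (local) mode writes output to the local filesystem,
# so remove the old output directory before rerunning the job.
cd ~/hadoop-job
rm -rf output
hadoop jar /usr/local/hadoop/hadoop-examples-1.1.2.jar wordcount input output
cat output/part-r-00000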
[hadoop user]
$ssh-keygen -t dsa
(press Enter for an empty passphrase)
$cat ~/.ssh/id_dsa.pub >> ~/.ssh/authorized_keys
$chmod 600 ~/.ssh/authorized_keys
$ssh localhost
$exit
$vim /usr/local/hadoop/conf/core-site.xml
<configuration>
  <property>
    <name>fs.default.name</name>
    <value>hdfs://localhost:9000</value>
  </property>
</configuration>
$vim /usr/local/hadoop/conf/mapred-site.xml
<configuration>
  <property>
    <name>mapred.job.tracker</name>
    <value>localhost:9001</value>
  </property>
</configuration>
$vim /usr/local/hadoop/conf/hdfs-site.xml
<configuration>
  <property>
    <name>dfs.replication</name>
    <value>1</value>
  </property>
</configuration>
$vim /usr/local/hadoop/conf/hadoop-env.sh
# The java implementation to use. Required.
# export JAVA_HOME=/usr/lib/j2sdk1.5-sun
export JAVA_HOME=/usr

# Extra Java CLASSPATH elements. Optional.
# export HADOOP_CLASSPATH=

# The maximum amount of heap to use, in MB. Default is 1000.
# export HADOOP_HEAPSIZE=2000
export HADOOP_HEAPSIZE=2046

$hadoop namenode -format
13/05/27 15:45:54 INFO namenode.NameNode: STARTUP_MSG:
/************************************************************
STARTUP_MSG: Starting NameNode
STARTUP_MSG:   host = ip-10-254-0-175/10.254.0.175
STARTUP_MSG:   args = [-format]
STARTUP_MSG:   version = 1.1.2
STARTUP_MSG:   build = https://svn.apache.org/repos/asf/hadoop/common/branches/branch-1.1 -r 1440782; compiled by 'hortonfo' on Thu Jan 31 02:03:24 UTC 2013
************************************************************/
Re-format filesystem in /tmp/hadoop-hadoop/dfs/name ? (Y or N) Y
13/05/27 15:45:57 INFO util.GSet: VM type = 64-bit
13/05/27 15:45:57 INFO util.GSet: 2% max memory = 17.77875 MB
13/05/27 15:45:57 INFO util.GSet: capacity = 2^21 = 2097152 entries
13/05/27 15:45:57 INFO util.GSet: recommended=2097152, actual=2097152
13/05/27 15:45:57 INFO namenode.FSNamesystem: fsOwner=hadoop
13/05/27 15:45:57 INFO namenode.FSNamesystem: supergroup=supergroup
13/05/27 15:45:57 INFO namenode.FSNamesystem: isPermissionEnabled=true
13/05/27 15:45:57 INFO namenode.FSNamesystem: dfs.block.invalidate.limit=100
13/05/27 15:45:57 INFO namenode.FSNamesystem: isAccessTokenEnabled=false accessKeyUpdateInterval=0 min(s), accessTokenLifetime=0 min(s)
13/05/27 15:45:57 INFO namenode.NameNode: Caching file names occuring more than 10 times
13/05/27 15:45:57 INFO common.Storage: Image file of size 112 saved in 0 seconds.
13/05/27 15:45:58 INFO namenode.FSEditLog: closing edit log: position=4, editlog=/tmp/hadoop-hadoop/dfs/name/current/edits
13/05/27 15:45:58 INFO namenode.FSEditLog: close success: truncate to 4, editlog=/tmp/hadoop-hadoop/dfs/name/current/edits
13/05/27 15:45:58 INFO common.Storage: Storage directory /tmp/hadoop-hadoop/dfs/name has been successfully formatted.
13/05/27 15:45:58 INFO namenode.NameNode: SHUTDOWN_MSG:
/************************************************************
SHUTDOWN_MSG: Shutting down NameNode at ip-10-254-0-175/10.254.0.175
************************************************************/
$start-all.sh
namenode running as process 27482. Stop it first.
localhost: starting datanode, logging to /usr/local/hadoop/libexec/../logs/hadoop-hadoop-datanode-ip-10-254-0-175.out
localhost: secondarynamenode running as process 27907. Stop it first.
jobtracker running as process 18517. Stop it first.
localhost: starting tasktracker, logging to /usr/local/hadoop/libexec/../logs/hadoop-hadoop-tasktracker-ip-10-254-0-175.out
$jps
28237 NodeManager
18517 JobTracker
28057 ResourceManager
20081 Jps
20004 TaskTracker
27482 NameNode
27907 SecondaryNameNode
$hadoop fs -ls /
Found 1 items
drwxr-xr-x   - hadoop supergroup          0 2013-05-27 16:01 /tmp
$cd hadoop-job
$hadoop fs -put input input
$hadoop fs -ls input
Found 1 items
-rw-r--r--   1 hadoop supergroup          6 2013-05-27 16:02 /user/hadoop/input/a
$hadoop jar /usr/local/hadoop/hadoop-examples-1.1.2.jar wordcount input output
13/05/27 16:03:40 INFO input.FileInputFormat: Total input paths to process : 1
13/05/27 16:03:40 INFO util.NativeCodeLoader: Loaded the native-hadoop library
13/05/27 16:03:40 WARN snappy.LoadSnappy: Snappy native library not loaded
13/05/27 16:03:41 INFO mapred.JobClient: Running job: job_201305271601_0001
13/05/27 16:03:42 INFO mapred.JobClient:  map 0% reduce 0%
13/05/27 16:03:48 INFO mapred.JobClient:  map 100% reduce 0%
13/05/27 16:03:56 INFO mapred.JobClient:  map 100% reduce 33%
13/05/27 16:03:57 INFO mapred.JobClient:  map 100% reduce 100%
13/05/27 16:03:58 INFO mapred.JobClient: Job complete: job_201305271601_0001
13/05/27 16:03:58 INFO mapred.JobClient: Counters: 29
13/05/27 16:03:58 INFO mapred.JobClient:   Job Counters
13/05/27 16:03:58 INFO mapred.JobClient:     Launched reduce tasks=1
13/05/27 16:03:58 INFO mapred.JobClient:     SLOTS_MILLIS_MAPS=5885
13/05/27 16:03:58 INFO mapred.JobClient:     Total time spent by all reduces waiting after reserving slots (ms)=0
13/05/27 16:03:58 INFO mapred.JobClient:     Total time spent by all maps waiting after reserving slots (ms)=0
13/05/27 16:03:58 INFO mapred.JobClient:     Launched map tasks=1
13/05/27 16:03:58 INFO mapred.JobClient:     Data-local map tasks=1
13/05/27 16:03:58 INFO mapred.JobClient:     SLOTS_MILLIS_REDUCES=8987
13/05/27 16:03:58 INFO mapred.JobClient:   File Output Format Counters
13/05/27 16:03:58 INFO mapred.JobClient:     Bytes Written=12
13/05/27 16:03:58 INFO mapred.JobClient:   FileSystemCounters
13/05/27 16:03:58 INFO mapred.JobClient:     FILE_BYTES_READ=30
13/05/27 16:03:58 INFO mapred.JobClient:     HDFS_BYTES_READ=112
13/05/27 16:03:58 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=103913
13/05/27 16:03:58 INFO mapred.JobClient:     HDFS_BYTES_WRITTEN=12
13/05/27 16:03:58 INFO mapred.JobClient:   File Input Format Counters
13/05/27 16:03:58 INFO mapred.JobClient:     Bytes Read=6
13/05/27 16:03:58 INFO mapred.JobClient:   Map-Reduce Framework
13/05/27 16:03:58 INFO mapred.JobClient:     Map output materialized bytes=30
13/05/27 16:03:58 INFO mapred.JobClient:     Map input records=1
13/05/27 16:03:58 INFO mapred.JobClient:     Reduce shuffle bytes=30
13/05/27 16:03:58 INFO mapred.JobClient:     Spilled Records=6
13/05/27 16:03:58 INFO mapred.JobClient:     Map output bytes=18
13/05/27 16:03:58 INFO mapred.JobClient:     CPU time spent (ms)=830
13/05/27 16:03:58 INFO mapred.JobClient:     Total committed heap usage (bytes)=292552704
13/05/27 16:03:58 INFO mapred.JobClient:     Combine input records=3
13/05/27 16:03:58 INFO mapred.JobClient:     SPLIT_RAW_BYTES=106
13/05/27 16:03:58 INFO mapred.JobClient:     Reduce input records=3
13/05/27 16:03:58 INFO mapred.JobClient:     Reduce input groups=3
13/05/27 16:03:58 INFO mapred.JobClient:     Combine output records=3
13/05/27 16:03:58 INFO mapred.JobClient:     Physical memory (bytes) snapshot=290643968
13/05/27 16:03:58 INFO mapred.JobClient:     Reduce output records=3
13/05/27 16:03:58 INFO mapred.JobClient:     Virtual memory (bytes) snapshot=2163224576
13/05/27 16:03:58 INFO mapred.JobClient:     Map output records=3
$hadoop fs -ls output
Found 3 items
-rw-r--r--   1 hadoop supergroup          0 2013-05-27 16:03 /user/hadoop/output/_SUCCESS
drwxr-xr-x   - hadoop supergroup          0 2013-05-27 16:03 /user/hadoop/output/_logs
-rw-r--r--   1 hadoop supergroup         12 2013-05-27 16:03 /user/hadoop/output/part-r-00000
$hadoop fs -cat output/part-r-00000
a    3
b    2
c    4
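In pseudo-distributed mode the results now live in HDFS rather than on the local disk. If you want them back as an ordinary file, -get copies the whole directory out, and -getmerge concatenates all part files into one local file. A small sketch (local file names are just examples):

#!/bin/bash
# Copy the whole output directory out of HDFS to the local filesystem...
hadoop fs -get output local-output
# ...or merge every part-r-* file into a single local file.
hadoop fs -getmerge output merged-counts.txt
cat merged-counts.txt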
[default user@hadoop1]
$sudo cp /home/hadoop/.ssh/id_dsa.pub .
$sudo chmod 644 id_dsa.pub

Copy the id_dsa.pub file to your local machine with an SCP client, then copy it from there to the hadoop2 and hadoop3 servers.

[default user@hadoop2]
$sudo adduser hadoop
$sudo su - hadoop

[hadoop@hadoop2]
$mkdir .ssh
$exit

[default user@hadoop2]
$sudo mv id_dsa.pub /home/hadoop/.ssh/hadoop1.pub
$sudo su - hadoop

[hadoop@hadoop2]
$cat ~/.ssh/hadoop1.pub >> ~/.ssh/authorized_keys
$chmod 600 ~/.ssh/authorized_keys

[hadoop@hadoop1]
$ssh -i ~/.ssh/id_dsa <instance-name>
$ssh <instance-name>

[default user@hadoop3]
$sudo adduser hadoop
$sudo su - hadoop

[hadoop@hadoop3]
$mkdir .ssh
$exit

[default user@hadoop3]
$sudo mv id_dsa.pub /home/hadoop/.ssh/hadoop1.pub
$sudo su - hadoop

[hadoop@hadoop3]
$cat ~/.ssh/hadoop1.pub >> ~/.ssh/authorized_keys
$chmod 600 ~/.ssh/authorized_keys

[hadoop@hadoop1]
$ssh -i ~/.ssh/id_dsa <instance-name>
$ssh <instance-name>

Do the same thing for hadoop2 and hadoop3.
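If the instance key pair is available on hadoop1, the manual SCP round-trip above can be replaced by a small loop run on hadoop1. This is a hypothetical convenience sketch, not the procedure from the original steps: it assumes the default user is "ubuntu" (adjust for your AMI), that "hadoop2" and "hadoop3" resolve to the right instances, and that the hadoop user already exists on both.

#!/bin/bash
# Push hadoop1's public key to each slave and append it to the
# hadoop user's authorized_keys there. Run as the default user on
# hadoop1, with SSH access to the slaves via the instance key pair.
for host in hadoop2 hadoop3; do
  scp /home/hadoop/.ssh/id_dsa.pub "ubuntu@${host}:/tmp/hadoop1.pub"
  ssh "ubuntu@${host}" 'sudo mkdir -p /home/hadoop/.ssh &&
    sudo sh -c "cat /tmp/hadoop1.pub >> /home/hadoop/.ssh/authorized_keys" &&
    sudo chown -R hadoop:hadoop /home/hadoop/.ssh &&
    sudo chmod 700 /home/hadoop/.ssh &&
    sudo chmod 600 /home/hadoop/.ssh/authorized_keys'
done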
Install Hadoop on hadoop2 and hadoop3, following the same steps as on hadoop1.

[hadoop@hadoop1]
$vim /usr/local/hadoop/conf/masters
<hadoop1 host name>
$vim /usr/local/hadoop/conf/slaves
<hadoop1 host name>
<hadoop2 host name>
<hadoop3 host name>
$vim /usr/local/hadoop/conf/core-site.xml
<configuration>
  <property>
    <name>fs.default.name</name>
    <value>hdfs://<hadoop1 host name>:9010</value>
  </property>
</configuration>
$vim /usr/local/hadoop/conf/mapred-site.xml
<configuration>
  <property>
    <name>mapred.job.tracker</name>
    <value><hadoop1 host name>:9011</value>
  </property>
</configuration>
$vim /usr/local/hadoop/conf/hdfs-site.xml
<configuration>
  <property>
    <name>dfs.replication</name>
    <value>3</value>
  </property>
</configuration>

Apply the same settings on hadoop2 and hadoop3 (or copy them over, as in the sketch below).
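Since the hadoop user can now SSH to every node and the install path is identical everywhere, the edited configuration can simply be copied instead of repeating the edits by hand. A minimal sketch, assuming the hadoop2/hadoop3 host names resolve from hadoop1:

#!/bin/bash
# Sync the master's configuration files to each slave.
for host in hadoop2 hadoop3; do
  scp /usr/local/hadoop/conf/{core-site.xml,mapred-site.xml,hdfs-site.xml,masters,slaves} \
      "${host}:/usr/local/hadoop/conf/"
done

Strictly speaking, only the *-site.xml files matter on the slaves; the masters and slaves files are read by the start scripts on hadoop1, but copying everything keeps the nodes identical.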
Congrats! Setup is complete; now format the NameNode and start the cluster.

[hadoop@hadoop1]
$hadoop namenode -format
$start-all.sh
namenode running as process 1172. Stop it first.
ec2-54-244-249-227.us-west-2.compute.amazonaws.com: datanode running as process 1368. Stop it first.
ec2-54-245-99-29.us-west-2.compute.amazonaws.com: starting datanode, logging to /usr/local/hadoop/libexec/../logs/hadoop-hadoop-datanode-ip-10-254-71-114.out
ec2-54-214-148-55.us-west-2.compute.amazonaws.com: starting datanode, logging to /usr/local/hadoop/libexec/../logs/hadoop-hadoop-datanode-ip-10-253-59-34.out
ec2-54-244-249-227.us-west-2.compute.amazonaws.com: secondarynamenode running as process 1577. Stop it first.
jobtracker running as process 1674. Stop it first.
ec2-54-244-249-227.us-west-2.compute.amazonaws.com: tasktracker running as process 1878. Stop it first.
ec2-54-245-99-29.us-west-2.compute.amazonaws.com: starting tasktracker, logging to /usr/local/hadoop/libexec/../logs/hadoop-hadoop-tasktracker-ip-10-254-71-114.out
ec2-54-214-148-55.us-west-2.compute.amazonaws.com: starting tasktracker, logging to /usr/local/hadoop/libexec/../logs/hadoop-hadoop-tasktracker-ip-10-253-59-34.out
$jps
1172 NameNode
1577 SecondaryNameNode
1368 DataNode
5848 Jps
1878 TaskTracker
1674 JobTracker

[hadoop@hadoop2]
$jps
10732 TaskTracker
10576 DataNode
10785 Jps

[hadoop@hadoop3]
$jps
10107 Jps
10054 TaskTracker
9898 DataNode

[hadoop@hadoop1]
$exit

[default user@hadoop1]
$sudo vim /etc/hosts
<private ip> <hadoop1 host name>
$sudo su - hadoop

[hadoop@hadoop1]
$hadoop fs -ls output
$hadoop fs -rmr output
$cd hadoop-job
$hadoop fs -rmr input
$hadoop fs -put input input
$hadoop jar /usr/local/hadoop/hadoop-examples-1.1.2.jar wordcount input output
13/05/27 17:46:10 INFO input.FileInputFormat: Total input paths to process : 2
13/05/27 17:46:10 INFO util.NativeCodeLoader: Loaded the native-hadoop library
13/05/27 17:46:10 WARN snappy.LoadSnappy: Snappy native library not loaded
13/05/27 17:46:11 INFO mapred.JobClient: Running job: job_201305271601_0003
13/05/27 17:46:12 INFO mapred.JobClient:  map 0% reduce 0%
13/05/27 17:46:18 INFO mapred.JobClient:  map 100% reduce 0%
13/05/27 17:46:26 INFO mapred.JobClient:  map 100% reduce 33%
13/05/27 17:46:28 INFO mapred.JobClient:  map 100% reduce 100%
13/05/27 17:46:29 INFO mapred.JobClient: Job complete: job_201305271601_0003
13/05/27 17:46:29 INFO mapred.JobClient: Counters: 29
13/05/27 17:46:29 INFO mapred.JobClient:   Job Counters
13/05/27 17:46:29 INFO mapred.JobClient:     Launched reduce tasks=1
13/05/27 17:46:29 INFO mapred.JobClient:     SLOTS_MILLIS_MAPS=10234
13/05/27 17:46:29 INFO mapred.JobClient:     Total time spent by all reduces waiting after reserving slots (ms)=0
13/05/27 17:46:29 INFO mapred.JobClient:     Total time spent by all maps waiting after reserving slots (ms)=0
13/05/27 17:46:29 INFO mapred.JobClient:     Launched map tasks=2
13/05/27 17:46:29 INFO mapred.JobClient:     Data-local map tasks=2
13/05/27 17:46:29 INFO mapred.JobClient:     SLOTS_MILLIS_REDUCES=9374
13/05/27 17:46:29 INFO mapred.JobClient:   File Output Format Counters
13/05/27 17:46:29 INFO mapred.JobClient:     Bytes Written=12
13/05/27 17:46:29 INFO mapred.JobClient:   FileSystemCounters
13/05/27 17:46:29 INFO mapred.JobClient:     FILE_BYTES_READ=54
13/05/27 17:46:29 INFO mapred.JobClient:     HDFS_BYTES_READ=312
13/05/27 17:46:29 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=156442
13/05/27 17:46:29 INFO mapred.JobClient:     HDFS_BYTES_WRITTEN=12
13/05/27 17:46:29 INFO mapred.JobClient:   File Input Format Counters
13/05/27 17:46:29 INFO mapred.JobClient:     Bytes Read=18
13/05/27 17:46:29 INFO mapred.JobClient:   Map-Reduce Framework
13/05/27 17:46:29 INFO mapred.JobClient:     Map output materialized bytes=60
13/05/27 17:46:29 INFO mapred.JobClient:     Map input records=2
13/05/27 17:46:29 INFO mapred.JobClient:     Reduce shuffle bytes=60
13/05/27 17:46:29 INFO mapred.JobClient:     Spilled Records=12
13/05/27 17:46:29 INFO mapred.JobClient:     Map output bytes=54
13/05/27 17:46:29 INFO mapred.JobClient:     CPU time spent (ms)=1310
13/05/27 17:46:29 INFO mapred.JobClient:     Total committed heap usage (bytes)=468058112
13/05/27 17:46:29 INFO mapred.JobClient:     Combine input records=9
13/05/27 17:46:29 INFO mapred.JobClient:     SPLIT_RAW_BYTES=294
13/05/27 17:46:29 INFO mapred.JobClient:     Reduce input records=6
13/05/27 17:46:29 INFO mapred.JobClient:     Reduce input groups=3
13/05/27 17:46:29 INFO mapred.JobClient:     Combine output records=6
13/05/27 17:46:29 INFO mapred.JobClient:     Physical memory (bytes) snapshot=490381312
13/05/27 17:46:29 INFO mapred.JobClient:     Reduce output records=3
13/05/27 17:46:29 INFO mapred.JobClient:     Virtual memory (bytes) snapshot=3258781696
13/05/27 17:46:29 INFO mapred.JobClient:     Map output records=9
$hadoop fs -cat output/part-r-00000
a    3
b    2
c    4
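To confirm that all three DataNodes actually joined the cluster and that blocks are replicated as dfs.replication=3 requires, Hadoop 1.x ships two handy diagnostic commands (run as the hadoop user on hadoop1; output omitted here):

#!/bin/bash
# Show HDFS capacity, usage, and the list of live DataNodes.
hadoop dfsadmin -report
# Check the health and replication status of everything under /.
hadoop fsck /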