From b155c89600a02c67466de722c8b270062f7d3ad5 Mon Sep 17 00:00:00 2001
From: Ivan Ermilov
Date: Wed, 11 May 2016 14:41:37 +0200
Subject: [PATCH] merged Hadoop from HDFS Workbench into v1.0.0

---
 base/Dockerfile            | 68 ++++++++++++++++++++++++++++++++
 base/entrypoint.sh         | 81 ++++++++++++++++++++++++++++++++++++++
 datanode/Dockerfile        | 11 ++++++
 datanode/run.sh            |  9 +++++
 docker-compose.yml         | 63 +++++++++++++++++++++++++++++
 hadoop-base/Dockerfile     | 30 --------------
 hadoop-base/core-site.xml  | 24 -----------
 hadoop-base/hdfs-site.xml  | 40 -------------------
 hadoop-datanode/Dockerfile |  5 ---
 hadoop-namenode/Dockerfile |  7 ----
 hadoop.env                 | 24 +++++++++++
 historyserver/Dockerfile   | 11 ++++++
 historyserver/run.sh       |  3 ++
 namenode/Dockerfile        | 11 ++++++
 namenode/run.sh            | 19 +++++++++
 nodemanager/Dockerfile     |  7 ++++
 nodemanager/run.sh         |  3 ++
 resourcemanager/Dockerfile |  7 ++++
 resourcemanager/run.sh     |  3 ++
 19 files changed, 320 insertions(+), 106 deletions(-)
 create mode 100644 base/Dockerfile
 create mode 100644 base/entrypoint.sh
 create mode 100644 datanode/Dockerfile
 create mode 100644 datanode/run.sh
 create mode 100644 docker-compose.yml
 delete mode 100644 hadoop-base/Dockerfile
 delete mode 100644 hadoop-base/core-site.xml
 delete mode 100644 hadoop-base/hdfs-site.xml
 delete mode 100644 hadoop-datanode/Dockerfile
 delete mode 100644 hadoop-namenode/Dockerfile
 create mode 100644 hadoop.env
 create mode 100644 historyserver/Dockerfile
 create mode 100644 historyserver/run.sh
 create mode 100644 namenode/Dockerfile
 create mode 100644 namenode/run.sh
 create mode 100644 nodemanager/Dockerfile
 create mode 100644 nodemanager/run.sh
 create mode 100644 resourcemanager/Dockerfile
 create mode 100644 resourcemanager/run.sh

diff --git a/base/Dockerfile b/base/Dockerfile
new file mode 100644
index 0000000..8c3b2fd
--- /dev/null
+++ b/base/Dockerfile
@@ -0,0 +1,68 @@
+FROM debian:7
+MAINTAINER Ivan Ermilov
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends openjdk-7-jdk
+ENV JAVA_HOME=/usr/lib/jvm/java-7-openjdk-amd64/
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends net-tools curl
+
+RUN gpg --keyserver pool.sks-keyservers.net --recv-keys \
+    07617D4968B34D8F13D56E20BE5AAA0BA210C095 \
+    2CAC83124870D88586166115220F69801F27E622 \
+    4B96409A098DBD511DF2BC18DBAF69BEA7239D59 \
+    9DD955653083EFED6171256408458C39E964B5FF \
+    B6B3F7EDA5BA7D1E827DE5180DFF492D8EE2F25C \
+    6A67379BEFC1AE4D5595770A34005598B8F47547 \
+    47660BC98BC433F01E5C90581209E7F13D0C92B9 \
+    CE83449FDC6DACF9D24174DCD1F99F6EE3CD2163 \
+    A11DF05DEA40DA19CE4B43C01214CF3F852ADB85 \
+    686E5EDF04A4830554160910DF0F5BBC30CD0996 \
+    5BAE7CB144D05AD1BB1C47C75C6CC6EFABE49180 \
+    AF7610D2E378B33AB026D7574FB955854318F669 \
+    6AE70A2A38F466A5D683F939255ADF56C36C5F0F \
+    70F7AB3B62257ABFBD0618D79FDB12767CC7352A \
+    842AAB2D0BC5415B4E19D429A342433A56D8D31A \
+    1B5D384B734F368052862EB55E43CAB9AEC77EAF \
+    785436A782586B71829C67A04169AA27ECB31663 \
+    5E49DA09E2EC9950733A4FF48F1895E97869A2FB \
+    A13B3869454536F1852C17D0477E02D33DD51430 \
+    A6220FFCC86FE81CE5AAC880E3814B59E4E11856 \
+    EFE2E7C571309FE00BEBA78D5E314EEF7340E1CB \
+    EB34498A9261F343F09F60E0A9510905F0B000F0 \
+    3442A6594268AC7B88F5C1D25104A731B021B57F \
+    6E83C32562C909D289E6C3D98B25B9B71EFF7770 \
+    E9216532BF11728C86A11E3132CF4BF4E72E74D3 \
+    E8966520DA24E9642E119A5F13971DA39475BD5D \
+    1D369094D4CFAC140E0EF05E992230B1EB8C6EFA \
+    A312CE6A1FA98892CB2C44EBA79AB712DE5868E6 \
+    0445B7BFC4515847C157ECD16BA72FF1C99785DE \
+    B74F188889D159F3D7E64A7F348C6D7A0DCED714 \
+    4A6AC5C675B6155682729C9E08D51A0A7501105C \
+    8B44A05C308955D191956559A5CEE20A90348D47
+
+ENV HADOOP_VERSION 2.7.1
+ENV HADOOP_URL https://www.apache.org/dist/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz
+RUN set -x \
+    && curl -fSL "$HADOOP_URL" -o /tmp/hadoop.tar.gz \
+    && curl -fSL "$HADOOP_URL.asc" -o /tmp/hadoop.tar.gz.asc \
+    && gpg --verify /tmp/hadoop.tar.gz.asc \
+    && tar -xvf /tmp/hadoop.tar.gz -C /opt/ \
+    && rm /tmp/hadoop.tar.gz*
+
+RUN ln -s /opt/hadoop-$HADOOP_VERSION/etc/hadoop /etc/hadoop
+RUN cp /etc/hadoop/mapred-site.xml.template /etc/hadoop/mapred-site.xml
+RUN mkdir /opt/hadoop-$HADOOP_VERSION/logs
+
+RUN mkdir /hadoop-data
+
+ENV HADOOP_PREFIX=/opt/hadoop-$HADOOP_VERSION
+ENV HADOOP_CONF_DIR=/etc/hadoop
+ENV MULTIHOMED_NETWORK=1
+
+ENV USER=root
+ENV PATH $HADOOP_PREFIX/bin/:$PATH
+
+ADD entrypoint.sh /entrypoint.sh
+RUN chmod a+x /entrypoint.sh
+
+ENTRYPOINT ["/entrypoint.sh"]
diff --git a/base/entrypoint.sh b/base/entrypoint.sh
new file mode 100644
index 0000000..d7b049b
--- /dev/null
+++ b/base/entrypoint.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+
+# Set some sensible defaults
+export CORE_CONF_fs_defaultFS=${CORE_CONF_fs_defaultFS:-hdfs://`hostname -f`:8020}
+
+function addProperty() {
+    local path=$1
+    local name=$2
+    local value=$3
+
+    local entry="<property><name>$name</name><value>${value}</value></property>"
+    local escapedEntry=$(echo $entry | sed 's/\//\\\//g')
+    sed -i "/<\/configuration>/ s/.*/${escapedEntry}\n&/" $path
+}
+
+function configure() {
+    local path=$1
+    local module=$2
+    local envPrefix=$3
+
+    local var
+    local value
+
+    echo "Configuring $module"
+    for c in `printenv | perl -sne 'print "$1 " if m/^${envPrefix}_(.+?)=.*/' -- -envPrefix=$envPrefix`; do
+        name=`echo ${c} | perl -pe 's/___/-/g; s/__/_/g; s/_/./g'`
+        var="${envPrefix}_${c}"
+        value=${!var}
+        echo " - Setting $name=$value"
+        addProperty /etc/hadoop/$module-site.xml $name "$value"
+    done
+}
+
+configure /etc/hadoop/core-site.xml core CORE_CONF
+configure /etc/hadoop/hdfs-site.xml hdfs HDFS_CONF
+configure /etc/hadoop/yarn-site.xml yarn YARN_CONF
+configure /etc/hadoop/httpfs-site.xml httpfs HTTPFS_CONF
+configure /etc/hadoop/kms-site.xml kms KMS_CONF
+
+if [ "$MULTIHOMED_NETWORK" = "1" ]; then
+    echo "Configuring for multihomed network"
+
+    # HDFS
+    addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.rpc-bind-host 0.0.0.0
+    addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.servicerpc-bind-host 0.0.0.0
+    addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.http-bind-host 0.0.0.0
+    addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.https-bind-host 0.0.0.0
+    addProperty /etc/hadoop/hdfs-site.xml dfs.client.use.datanode.hostname true
+    addProperty /etc/hadoop/hdfs-site.xml dfs.datanode.use.datanode.hostname true
+
+    # YARN
+    addProperty /etc/hadoop/yarn-site.xml yarn.resourcemanager.bind-host 0.0.0.0
+    addProperty /etc/hadoop/yarn-site.xml yarn.nodemanager.bind-host 0.0.0.0
+    addProperty /etc/hadoop/yarn-site.xml yarn.nodemanager.bind-host 0.0.0.0
+    addProperty /etc/hadoop/yarn-site.xml yarn.timeline-service.bind-host 0.0.0.0
+
+    # MAPRED
+    addProperty /etc/hadoop/mapred-site.xml yarn.nodemanager.bind-host 0.0.0.0
+fi
+
+if [ -n "$GANGLIA_HOST" ]; then
+    mv /etc/hadoop/hadoop-metrics.properties /etc/hadoop/hadoop-metrics.properties.orig
+    mv /etc/hadoop/hadoop-metrics2.properties /etc/hadoop/hadoop-metrics2.properties.orig
+
+    for module in mapred jvm rpc ugi; do
"$module.class=org.apache.hadoop.metrics.ganglia.GangliaContext31" + echo "$module.period=10" + echo "$module.servers=$GANGLIA_HOST:8649" + done > /etc/hadoop/hadoop-metrics.properties + + for module in namenode datanode resourcemanager nodemanager mrappmaster jobhistoryserver; do + echo "$module.sink.ganglia.class=org.apache.hadoop.metrics2.sink.ganglia.GangliaSink31" + echo "$module.sink.ganglia.period=10" + echo "$module.sink.ganglia.supportsparse=true" + echo "$module.sink.ganglia.slope=jvm.metrics.gcCount=zero,jvm.metrics.memHeapUsedM=both" + echo "$module.sink.ganglia.dmax=jvm.metrics.threadsBlocked=70,jvm.metrics.memHeapUsedM=40" + echo "$module.sink.ganglia.servers=$GANGLIA_HOST:8649" + done > /etc/hadoop/hadoop-metrics2.properties +fi + +exec $@ diff --git a/datanode/Dockerfile b/datanode/Dockerfile new file mode 100644 index 0000000..44bc712 --- /dev/null +++ b/datanode/Dockerfile @@ -0,0 +1,11 @@ +FROM bde2020/hadoop-base:1.0.0 +MAINTAINER Ivan Ermilov + +ENV HDFS_CONF_dfs_datanode_data_dir=file:///hadoop/dfs/data +RUN mkdir -p /hadoop/dfs/data +VOLUME /hadoop/dfs/data + +ADD run.sh /run.sh +RUN chmod a+x /run.sh + +CMD ["/run.sh"] diff --git a/datanode/run.sh b/datanode/run.sh new file mode 100644 index 0000000..9f57ee2 --- /dev/null +++ b/datanode/run.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +datadir=`echo $HDFS_CONF_dfs_datanode_data_dir | perl -pe 's#file://##'` +if [ ! -d $datadir ]; then + echo "Datanode data directory not found: $datadir" + exit 2 +fi + +$HADOOP_PREFIX/bin/hdfs --config $HADOOP_CONF_DIR datanode diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..acf0715 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,63 @@ +namenode: + image: bde2020/hadoop-namenode:1.0.0 + hostname: namenode + container_name: namenode + domainname: hadoop + net: hadoop + volumes: + - ./data/namenode:/hadoop/dfs/name + environment: + - CLUSTER_NAME=test + env_file: + - ./hadoop.env + +resourcemanager: + image: bde2020/hadoop-resourcemanager:1.0.0 + hostname: resourcemanager + container_name: resourcemanager + domainname: hadoop + net: hadoop + env_file: + - ./hadoop.env + +historyserver: + image: bde2020/hadoop-historyserver:1.0.0 + hostname: historyserver + container_name: historyserver + domainname: hadoop + net: hadoop + volumes: + - historyserver:/hadoop/yarn/timeline + env_file: + - ./hadoop.env + +nodemanager1: + image: bde2020/hadoop-nodemanager:1.0.0 + hostname: nodemanager1 + container_name: nodemanager1 + domainname: hadoop + net: hadoop + env_file: + - ./hadoop.env + +datanode1: + image: uhopper/hadoop-datanode:1.0.0 + hostname: datanode1 + container_name: datanode1 + domainname: hadoop + net: hadoop + volumes: + - ./data/datanode1:/hadoop/dfs/data + env_file: + - ./hadoop.env + +datanode2: + image: uhopper/hadoop-datanode:1.0.0 + hostname: datanode2 + container_name: datanode2 + domainname: hadoop + net: hadoop + volumes: + - ./data/datanode2:/hadoop/dfs/data + env_file: + - ./hadoop.env diff --git a/hadoop-base/Dockerfile b/hadoop-base/Dockerfile deleted file mode 100644 index f775c73..0000000 --- a/hadoop-base/Dockerfile +++ /dev/null @@ -1,30 +0,0 @@ -FROM java:8-jre - -MAINTAINER Yiannis Mouchakis - -# define hadoop version -ENV HADOOP_VERSION 2.7.1 - -# Hadoop env variables -ENV HADOOP_PREFIX /opt/hadoop -ENV HADOOP_CONF_DIR $HADOOP_PREFIX/conf -ENV PATH $PATH:$HADOOP_PREFIX/bin -ENV PATH $PATH:$HADOOP_PREFIX/sbin - -RUN apt-get update && apt-get install -y \ - wget \ - tar \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* - -# 
-# deploy hadoop
-RUN wget http://archive.apache.org/dist/hadoop/core/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz
-RUN tar -zxf /hadoop-$HADOOP_VERSION.tar.gz
-RUN rm /hadoop-$HADOOP_VERSION.tar.gz
-RUN mv hadoop-$HADOOP_VERSION $HADOOP_PREFIX
-
-# add configuration files
-ADD core-site.xml $HADOOP_CONF_DIR/core-site.xml
-ADD hdfs-site.xml $HADOOP_CONF_DIR/hdfs-site.xml
-
-CMD hdfs namenode -format -nonInteractive & hdfs namenode && hdfs datanode
diff --git a/hadoop-base/core-site.xml b/hadoop-base/core-site.xml
deleted file mode 100644
index 0a336c8..0000000
--- a/hadoop-base/core-site.xml
+++ /dev/null
@@ -1,24 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-
-<configuration>
-    <property>
-        <name>fs.defaultFS</name>
-        <value>hdfs://namenode:8020</value>
-    </property>
-</configuration>
diff --git a/hadoop-base/hdfs-site.xml b/hadoop-base/hdfs-site.xml
deleted file mode 100644
index f453848..0000000
--- a/hadoop-base/hdfs-site.xml
+++ /dev/null
@@ -1,40 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-
-<configuration>
-    <property>
-        <name>dfs.replication</name>
-        <value>2</value>
-    </property>
-    <property>
-        <name>dfs.datanode.data.dir</name>
-        <value>/hdfs-data/datanode</value>
-    </property>
-    <property>
-        <name>dfs.namenode.name.dir</name>
-        <value>/hdfs-data/namenode</value>
-    </property>
-    <property>
-        <name>dfs.namenode.datanode.registration.ip-hostname-check</name>
-        <value>false</value>
-    </property>
-    <property>
-        <name>dfs.permissions.enabled</name>
-        <value>false</value>
-    </property>
-</configuration>
diff --git a/hadoop-datanode/Dockerfile b/hadoop-datanode/Dockerfile
deleted file mode 100644
index 582c240..0000000
--- a/hadoop-datanode/Dockerfile
+++ /dev/null
@@ -1,5 +0,0 @@
-FROM bde2020/hadoop-base
-
-MAINTAINER Yiannis Mouchakis
-
-CMD hdfs datanode
\ No newline at end of file
diff --git a/hadoop-namenode/Dockerfile b/hadoop-namenode/Dockerfile
deleted file mode 100644
index f2711cf..0000000
--- a/hadoop-namenode/Dockerfile
+++ /dev/null
@@ -1,7 +0,0 @@
-FROM bde2020/hadoop-base
-
-MAINTAINER Yiannis Mouchakis
-
-EXPOSE 50070 8020
-
-CMD hdfs namenode -format -nonInteractive & hdfs namenode
\ No newline at end of file
diff --git a/hadoop.env b/hadoop.env
new file mode 100644
index 0000000..5e88669
--- /dev/null
+++ b/hadoop.env
@@ -0,0 +1,24 @@
+#GANGLIA_HOST=ganglia.hadoop
+
+CORE_CONF_fs_defaultFS=hdfs://namenode:8020
+CORE_CONF_hadoop_http_staticuser_user=root
+CORE_CONF_hadoop_proxyuser_hue_hosts=*
+CORE_CONF_hadoop_proxyuser_hue_groups=*
+
+HDFS_CONF_dfs_webhdfs_enabled=true
+HDFS_CONF_dfs_permissions_enabled=false
+
+YARN_CONF_yarn_log___aggregation___enable=true
+YARN_CONF_yarn_resourcemanager_recovery_enabled=true
+YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore
+YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate
+YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs
+
+YARN_CONF_yarn_log_server_url=http://historyserver.hadoop:8188/applicationhistory/logs/
+YARN_CONF_yarn_timeline___service_enabled=true
+YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true
+YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true
+
+
+YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
+YARN_CONF_yarn_timeline___service_hostname=historyserver.hadoop
diff --git a/historyserver/Dockerfile b/historyserver/Dockerfile
new file mode 100644
index 0000000..358fad8
--- /dev/null
+++ b/historyserver/Dockerfile
@@ -0,0 +1,11 @@
+FROM bde2020/hadoop-base:1.0.0
+MAINTAINER Ivan Ermilov
+
+ENV YARN_CONF_yarn_timeline___service_leveldb___timeline___store_path=/hadoop/yarn/timeline
+RUN mkdir -p /hadoop/yarn/timeline
+VOLUME /hadoop/yarn/timeline
+
+ADD run.sh /run.sh
+RUN chmod a+x /run.sh
+
+CMD ["/run.sh"]
diff --git a/historyserver/run.sh b/historyserver/run.sh
new file mode 100644
index 0000000..1ce6633
--- /dev/null
+++ b/historyserver/run.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+$HADOOP_PREFIX/bin/yarn --config $HADOOP_CONF_DIR historyserver
diff --git a/namenode/Dockerfile b/namenode/Dockerfile
new file mode 100644
index 0000000..ee947bf
--- /dev/null
+++ b/namenode/Dockerfile
@@ -0,0 +1,11 @@
+FROM bde2020/hadoop-base:1.0.0
+MAINTAINER Ivan Ermilov
+
+ENV HDFS_CONF_dfs_namenode_name_dir=file:///hadoop/dfs/name
+RUN mkdir -p /hadoop/dfs/name
+VOLUME /hadoop/dfs/name
+
+ADD run.sh /run.sh
+RUN chmod a+x /run.sh
+
+CMD ["/run.sh"]
diff --git a/namenode/run.sh b/namenode/run.sh
new file mode 100644
index 0000000..8ef0797
--- /dev/null
+++ b/namenode/run.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+namedir=`echo $HDFS_CONF_dfs_namenode_name_dir | perl -pe 's#file://##'`
+if [ ! -d $namedir ]; then
+  echo "Namenode name directory not found: $namedir"
+  exit 2
+fi
+
+if [ -z "$CLUSTER_NAME" ]; then
+  echo "Cluster name not specified"
+  exit 2
+fi
+
+if [ "`ls -A $namedir`" == "" ]; then
+  echo "Formatting namenode name directory: $namedir"
+  $HADOOP_PREFIX/bin/hdfs --config $HADOOP_CONF_DIR namenode -format $CLUSTER_NAME
+fi
+
+$HADOOP_PREFIX/bin/hdfs --config $HADOOP_CONF_DIR namenode
diff --git a/nodemanager/Dockerfile b/nodemanager/Dockerfile
new file mode 100644
index 0000000..a026fd1
--- /dev/null
+++ b/nodemanager/Dockerfile
@@ -0,0 +1,7 @@
+FROM bde2020/hadoop-base:1.0.0
+MAINTAINER Ivan Ermilov
+
+ADD run.sh /run.sh
+RUN chmod a+x /run.sh
+
+CMD ["/run.sh"]
diff --git a/nodemanager/run.sh b/nodemanager/run.sh
new file mode 100644
index 0000000..115bcdb
--- /dev/null
+++ b/nodemanager/run.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+$HADOOP_PREFIX/bin/yarn --config $HADOOP_CONF_DIR nodemanager
diff --git a/resourcemanager/Dockerfile b/resourcemanager/Dockerfile
new file mode 100644
index 0000000..a026fd1
--- /dev/null
+++ b/resourcemanager/Dockerfile
@@ -0,0 +1,7 @@
+FROM bde2020/hadoop-base:1.0.0
+MAINTAINER Ivan Ermilov
+
+ADD run.sh /run.sh
+RUN chmod a+x /run.sh
+
+CMD ["/run.sh"]
diff --git a/resourcemanager/run.sh b/resourcemanager/run.sh
new file mode 100644
index 0000000..c1bdb94
--- /dev/null
+++ b/resourcemanager/run.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+$HADOOP_PREFIX/bin/yarn --config $HADOOP_CONF_DIR resourcemanager
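
A minimal smoke test for the composition introduced by this patch (a sketch, not part of the patch itself). It assumes the local directories are built and tagged to match the image names in docker-compose.yml, that the datanode image referenced there as uhopper/hadoop-datanode:1.0.0 is either pulled or produced by retagging the local datanode/ build, and that the network named by the legacy "net: hadoop" option already exists:

    # build base first; the service images all start FROM bde2020/hadoop-base:1.0.0
    docker build -t bde2020/hadoop-base:1.0.0 ./base
    docker build -t bde2020/hadoop-namenode:1.0.0 ./namenode
    docker build -t bde2020/hadoop-resourcemanager:1.0.0 ./resourcemanager
    docker build -t bde2020/hadoop-nodemanager:1.0.0 ./nodemanager
    docker build -t bde2020/hadoop-historyserver:1.0.0 ./historyserver
    # assumed tag: the compose file points the datanode services at uhopper/hadoop-datanode:1.0.0
    docker build -t uhopper/hadoop-datanode:1.0.0 ./datanode

    # the v1 compose "net:" option attaches to an existing network, so create it up front
    docker network create hadoop

    docker-compose up -d
    docker exec namenode hdfs dfsadmin -report   # expect two live datanodes once they register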