1.在github上下载Spark源码到本地:
git clone https://github.com/apache/spark.git
cd spark
git checkout v1.1.0 #取出1.1.0版本的spark
2.修改pom.xml:
<protobuf.version>2.5.0</protobuf.version>
#修改为所用hadoop版本对应的protobuf版本
3.编译:
./make-distribution.sh --tgz --skip-java-test -Dyarn.version=2.4.1
-Dhadoop.version=2.4.1 -Pyarn
#生成spark-1.1.0-bin.tgz文件
tar -zxvf spark-1.1.0-bin.tgz
mv spark-1.1.0-bin/* /usr/local/spark/
sudo adduser spark
chown -R spark:spark /usr/local/spark
4.添加环境变量:
export SPARK_HOME=/usr/local/spark
export
SPARK_JAR=$SPARK_HOME/lib/spark-assembly-1.1.0-hadoop2.4.1.jar
export PATH=$SPARK_HOME/bin:$PATH
5.修改conf目录下的spark-env.sh,添加环境变量:
export HADOOP_CONF_DIR=$HADOOP_INSTALL/etc/hadoop
export YARN_CONF_DIR=$HADOOP_INSTALL/etc/hadoop
6.测试:
spark-submit --class org.apache.spark.examples.SparkPi --master \
yarn-cluster --num-executors 1 --driver-memory 1g --executor-memory
1g --executor-cores 1 lib/spark-examples*.jar 1
7.使用PySpark: 交互终端
pyspark --master yarn
IPython Notebook: 把下面代码加到~/.bashrc
# Launch PySpark on YARN inside an IPython Notebook server.
# Arguments: $1 - HTTP port for the notebook server
#            $2 - notebook working directory
#            remaining args are forwarded to pyspark unchanged
# Globals:   SPARK_HOME (read); IPYTHON, IPYTHON_OPTS (written/exported)
pyspark_yarn() {
  local port=$1
  local notebook_dir=$2
  # Drop port/dir so only genuine extra options reach pyspark.
  # (Original passed $*, which re-forwarded $1 and $2 as pyspark args.)
  shift 2
  source /home/spark/pyenvs/py2.7/bin/activate
  export IPYTHON=1
  export IPYTHON_OPTS="notebook --ip=0.0.0.0 --port=$port --notebook-dir=$notebook_dir --matplotlib=inline --no-browser"
  "$SPARK_HOME/bin/pyspark" --master yarn --deploy-mode client \
    --driver-memory 1G --num-executors 2 --executor-cores 2 "$@"
}
source ~/.bashrc
pyspark_yarn port work_directory
进入到http://localhost:port访问IPython Notebook
tips: use mvn to download the dependency libs into the lib directory: mvn dependency:copy-dependencies -DoutputDirectory=lib -DincludeScope=compile