From 3702608eb9568bce36df2c645c9cd6ba9fc9bc2f Mon Sep 17 00:00:00 2001
From: tennix
Date: Mon, 16 Jul 2018 22:30:33 +0800
Subject: [PATCH 1/4] update tispark to 1.0

---
 README.md          |  9 +++++++++
 tispark/Dockerfile | 23 +++++++++++++++++++----
 2 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 22cf159..2a987e5 100644
--- a/README.md
+++ b/README.md
@@ -180,3 +180,12 @@ scala> spark.sql("select count(*) from lineitem").show
 |   60175|
 +--------+
 ```
+
+You can also access Spark with Python or R using following commands:
+
+```
+docker-compose exec tispark-master /opt/spark/bin/pyspark
+docker-compose exec tispark-master /opt/spark/bin/sparkR
+```
+
+More document about TiSpark can be found [here](https://github.com/pingcap/tispark).
diff --git a/tispark/Dockerfile b/tispark/Dockerfile
index 750e377..62c6758 100644
--- a/tispark/Dockerfile
+++ b/tispark/Dockerfile
@@ -2,25 +2,40 @@ FROM anapsix/alpine-java:8
 ENV SPARK_VERSION=2.1.1 \
     HADOOP_VERSION=2.7 \
-    TISPARK_VERSION=0.1.0-SNAPSHOT \
+    TISPARK_VERSION=1.0 \
+    TISPARK_R_VERSION=1.1 \
+    TISPARK_PYTHON_VERSION=1.0.1 \
     SPARK_HOME=/opt/spark \
     SPARK_NO_DAEMONIZE=true \
     SPARK_MASTER_PORT=7077 \
     SPARK_MASTER_HOST=0.0.0.0 \
     SPARK_MASTER_WEBUI_PORT=8080

+ADD R /TiSparkR
+
 # base image only contains busybox version nohup and ps
 # spark scripts needs nohup in coreutils and ps in procps
 # and we can use mysql-client to test tidb connection
-RUN apk --no-cache add coreutils procps mysql-client python py-pip R \
-    && pip install pytispark==1.0.1 pyspark==2.1.2
+RUN apk --no-cache add \
+    coreutils \
+    mysql-client \
+    procps \
+    python \
+    py-pip \
+    R \
+    && pip install --no-cache-dir pytispark==${TISPARK_PYTHON_VERSION} \
+    && R CMD build TiSparkR \
+    && R CMD INSTALL TiSparkR_${TISPARK_R_VERSION}.tar.gz \
+    && rm -rf /TiSparkR_${TISPARK_R_VERSION}.tar.gz /TiSparkR

 RUN wget -q https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
     && tar zxf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz -C /opt/ \
     && ln -s /opt/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} ${SPARK_HOME} \
-    && wget -q http://download.pingcap.org/tispark-${TISPARK_VERSION}-jar-with-dependencies.jar -P ${SPARK_HOME}/jars \
+    && wget -q https://github.com/pingcap/tispark/releases/download/${TISPARK_VERSION}/tispark-core-${TISPARK_VERSION}-jar-with-dependencies.jar -P ${SPARK_HOME}/jars \
     && wget -q http://download.pingcap.org/tispark-sample-data.tar.gz \
     && tar zxf tispark-sample-data.tar.gz -C ${SPARK_HOME}/data/ \
     && rm -rf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz tispark-sample-data.tar.gz

+ENV PYTHONPATH=${SPARK_HOME}/python/lib/py4j-0.10.4-src.zip:${SPARK_HOME}/python:$PYTHONPATH
+
 WORKDIR ${SPARK_HOME}
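The patch above wires Python support into the image (pytispark from pip, pyspark on `PYTHONPATH`), but the repository README still only demonstrates a Scala session. A minimal pyspark equivalent looks roughly like the sketch below; it assumes the pytispark 1.0.1 package exposes a `TiContext` wrapper under `pytispark.pytispark`, as its upstream README describes, and that the bundled tispark-sample-data has been loaded into TiDB as the `tpch_test` database.

```python
# Run inside the shell started by:
#   docker-compose exec tispark-master /opt/spark/bin/pyspark
# where `spark` is the SparkSession the shell provides.
import pytispark.pytispark as pti  # assumed import path for the pip-installed pytispark

ti = pti.TiContext(spark)          # wrap the session, mirroring the Scala/R TiContext
ti.tidbMapDatabase("tpch_test")    # map the sample database
spark.sql("select count(*) from lineitem").show()
```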
From 4bb5b406bf3b906d77423d240294289f167f416f Mon Sep 17 00:00:00 2001
From: tennix
Date: Mon, 16 Jul 2018 22:32:50 +0800
Subject: [PATCH 2/4] add TiSparkR

---
 tispark/R/DESCRIPTION  | 11 +++++++++++
 tispark/R/NAMESPACE    |  1 +
 tispark/R/R/tisparkR.R | 41 +++++++++++++++++++++++++++++++++++++++++
 tispark/R/README.md    | 42 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 95 insertions(+)
 create mode 100644 tispark/R/DESCRIPTION
 create mode 100644 tispark/R/NAMESPACE
 create mode 100644 tispark/R/R/tisparkR.R
 create mode 100644 tispark/R/README.md

diff --git a/tispark/R/DESCRIPTION b/tispark/R/DESCRIPTION
new file mode 100644
index 0000000..fbfbe62
--- /dev/null
+++ b/tispark/R/DESCRIPTION
@@ -0,0 +1,11 @@
+Package: TiSparkR
+Type: Package
+Title: TiSpark for R
+Version: 1.1
+Author: PingCAP
+Maintainer: Novemser
+Description: A shabby thin layer to support TiSpark in R language.
+License: Apache 2.0
+Copyright: 2017 PingCAP, Inc.
+Encoding: UTF-8
+LazyData: true
diff --git a/tispark/R/NAMESPACE b/tispark/R/NAMESPACE
new file mode 100644
index 0000000..d75f824
--- /dev/null
+++ b/tispark/R/NAMESPACE
@@ -0,0 +1 @@
+exportPattern("^[[:alpha:]]+")
diff --git a/tispark/R/R/tisparkR.R b/tispark/R/R/tisparkR.R
new file mode 100644
index 0000000..ac7a272
--- /dev/null
+++ b/tispark/R/R/tisparkR.R
@@ -0,0 +1,41 @@
+#
+# Copyright 2017 PingCAP, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+
+# Title : TiSparkR
+# Objective : TiSpark entry for R
+# Created by: novemser
+# Created on: 17-11-1
+
+# Function:createTiContext
+# Create a new TiContext via the spark session passed in
+#
+# @return A new TiContext created on session
+# @param session A Spark Session for TiContext creation
+createTiContext <- function(session) {
+  sparkR.newJObject("org.apache.spark.sql.TiContext", session)
+}
+
+# Function:tidbMapDatabase
+# Mapping TiContext designated database to `dbName`.
+#
+# @param tiContext TiSpark context
+# @param dbName Database name to map
+# @param isPrefix Whether to use dbName As Prefix
+# @param loadStatistics Whether to use statistics information from TiDB
+tidbMapDatabase <- function(tiContext, dbName, isPrefix=FALSE, loadStatistics=TRUE) {
+  sparkR.callJMethod(tiContext, "tidbMapDatabase", dbName, isPrefix, loadStatistics)
+  paste("Mapping to database:", dbName)
+}
diff --git a/tispark/R/README.md b/tispark/R/README.md
new file mode 100644
index 0000000..bfffdee
--- /dev/null
+++ b/tispark/R/README.md
@@ -0,0 +1,42 @@
+## TiSparkR
+A thin layer build for supporting R language with TiSpark
+
+### Usage
+1. Download TiSparkR source code and build a binary package(run `R CMD build R` in TiSpark root directory). Install it to your local R library(e.g. via `R CMD INSTALL TiSparkR_1.0.0.tar.gz`)
+2. Build or download TiSpark dependency jar `tispark-core-1.0-RC1-jar-with-dependencies.jar` [here](https://github.com/pingcap/tispark).
+3. `cd` to your Spark home directory, and run
+```
+./bin/sparkR --jars /where-ever-it-is/tispark-core-${version}-jar-with-dependencies.jar
+```
+Note that you should replace the `TiSpark` jar path with your own.
+
+4. Use as below in your R console:
+```R
+# import tisparkR library
+> library(TiSparkR)
+# create a TiContext instance
+> ti <- createTiContext(spark)
+# Map TiContext to database:tpch_test
+> tidbMapDatabase(ti, "tpch_test")
+
+# Run a sql query
+> customers <- sql("select * from customer")
+# Print schema
+> printSchema(customers)
+root
+ |-- c_custkey: long (nullable = true)
+ |-- c_name: string (nullable = true)
+ |-- c_address: string (nullable = true)
+ |-- c_nationkey: long (nullable = true)
+ |-- c_phone: string (nullable = true)
+ |-- c_acctbal: decimal(15,2) (nullable = true)
+ |-- c_mktsegment: string (nullable = true)
+ |-- c_comment: string (nullable = true)
+
+# Run a count query
+> count <- sql("select count(*) from customer")
+# Print count result
+> head(count)
+  count(1)
+1      150
+```
\ No newline at end of file
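The two R functions in `tisparkR.R` are thin wrappers over SparkR's JVM bridge (`sparkR.newJObject` and `sparkR.callJMethod`). The same two calls can be reproduced from pyspark through its py4j gateway; the sketch below mirrors them, taking the `TiContext` constructor and the three-argument `tidbMapDatabase` signature from the R file above. The underscore-prefixed pyspark attributes are private implementation details and may change between Spark versions.

```python
# py4j equivalent of createTiContext()/tidbMapDatabase() from tisparkR.R.
# Run inside pyspark on the tispark image, where the TiSpark jar already sits
# in ${SPARK_HOME}/jars and `spark` is the provided SparkSession.
jvm = spark.sparkContext._jvm
ti = jvm.org.apache.spark.sql.TiContext(spark._jsparkSession)  # createTiContext(spark)
ti.tidbMapDatabase("tpch_test", False, True)                   # dbName, isPrefix, loadStatistics
spark.sql("select count(*) from customer").show()
```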
From cf8eeb48385840d6856702a8d5786b64d3e16378 Mon Sep 17 00:00:00 2001
From: tennix
Date: Tue, 17 Jul 2018 16:19:10 +0800
Subject: [PATCH 3/4] address comment

---
 README.md           |  4 +--
 tispark/R/README.md | 74 +++++++++++++++++++++++-----------------------
 2 files changed, 40 insertions(+), 38 deletions(-)

diff --git a/README.md b/README.md
index 2a987e5..f557dbe 100644
--- a/README.md
+++ b/README.md
@@ -181,11 +181,11 @@ scala> spark.sql("select count(*) from lineitem").show
 +--------+
 ```

-You can also access Spark with Python or R using following commands:
+You can also access Spark with Python or R using the following commands:

 ```
 docker-compose exec tispark-master /opt/spark/bin/pyspark
 docker-compose exec tispark-master /opt/spark/bin/sparkR
 ```

-More document about TiSpark can be found [here](https://github.com/pingcap/tispark).
+More documents about TiSpark can be found [here](https://github.com/pingcap/tispark).
diff --git a/tispark/R/README.md b/tispark/R/README.md
index bfffdee..fab5d3f 100644
--- a/tispark/R/README.md
+++ b/tispark/R/README.md
@@ -1,42 +1,44 @@
 ## TiSparkR
-A thin layer build for supporting R language with TiSpark
+TiSparkR is a thin layer built to support the R language with TiSpark.

 ### Usage
-1. Download TiSparkR source code and build a binary package(run `R CMD build R` in TiSpark root directory). Install it to your local R library(e.g. via `R CMD INSTALL TiSparkR_1.0.0.tar.gz`)
+1. Download the TiSparkR source code and build a binary package (run `R CMD build R` in TiSpark root directory). Install it to your local R library (e.g. via `R CMD INSTALL TiSparkR_1.0.0.tar.gz`)
+
 2. Build or download TiSpark dependency jar `tispark-core-1.0-RC1-jar-with-dependencies.jar` [here](https://github.com/pingcap/tispark).
-3. `cd` to your Spark home directory, and run
-```
-./bin/sparkR --jars /where-ever-it-is/tispark-core-${version}-jar-with-dependencies.jar
-```
-Note that you should replace the `TiSpark` jar path with your own.
+
+3. `cd` to your Spark home directory, and run:
+   ```
+   ./bin/sparkR --jars /where-ever-it-is/tispark-core-${version}-jar-with-dependencies.jar
+   ```
+   Note that you should replace the `TiSpark` jar path with your own.

 4. Use as below in your R console:
-```R
-# import tisparkR library
-> library(TiSparkR)
-# create a TiContext instance
-> ti <- createTiContext(spark)
-# Map TiContext to database:tpch_test
-> tidbMapDatabase(ti, "tpch_test")
-
-# Run a sql query
-> customers <- sql("select * from customer")
-# Print schema
-> printSchema(customers)
-root
- |-- c_custkey: long (nullable = true)
- |-- c_name: string (nullable = true)
- |-- c_address: string (nullable = true)
- |-- c_nationkey: long (nullable = true)
- |-- c_phone: string (nullable = true)
- |-- c_acctbal: decimal(15,2) (nullable = true)
- |-- c_mktsegment: string (nullable = true)
- |-- c_comment: string (nullable = true)
-
-# Run a count query
-> count <- sql("select count(*) from customer")
-# Print count result
-> head(count)
-  count(1)
-1      150
-```
\ No newline at end of file
+   ```R
+   # import tisparkR library
+   > library(TiSparkR)
+   # create a TiContext instance
+   > ti <- createTiContext(spark)
+   # Map TiContext to database:tpch_test
+   > tidbMapDatabase(ti, "tpch_test")
+
+   # Run a sql query
+   > customers <- sql("select * from customer")
+   # Print schema
+   > printSchema(customers)
+   root
+    |-- c_custkey: long (nullable = true)
+    |-- c_name: string (nullable = true)
+    |-- c_address: string (nullable = true)
+    |-- c_nationkey: long (nullable = true)
+    |-- c_phone: string (nullable = true)
+    |-- c_acctbal: decimal(15,2) (nullable = true)
+    |-- c_mktsegment: string (nullable = true)
+    |-- c_comment: string (nullable = true)
+
+   # Run a count query
+   > count <- sql("select count(*) from customer")
+   # Print count result
+   > head(count)
+     count(1)
+   1      150
+   ```
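The README steps above target the sparkR shell inside the prebuilt image, where the TiSpark jar is already on the classpath. A Python session built outside the image has to attach the jar explicitly; the sketch below shows one way to do that. The application name, master URL, and jar path are placeholders, and whatever TiSpark-specific settings your deployment needs (for example the placement driver address) still have to be supplied through your Spark configuration.

```python
# Sketch: start a SparkSession with the TiSpark jar attached, the Python analogue of
# `./bin/sparkR --jars /where-ever-it-is/tispark-core-${version}-jar-with-dependencies.jar`.
# The master URL and jar path below are placeholders, not values taken from the patches.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("tispark-sketch")
    .master("spark://tispark-master:7077")
    .config("spark.jars", "/where-ever-it-is/tispark-core-1.0-RC1-jar-with-dependencies.jar")
    .getOrCreate()
)
```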
From 8313e38d8483e7bf1ff737a2eb6737ea98c40aa5 Mon Sep 17 00:00:00 2001
From: tennix
Date: Mon, 23 Jul 2018 17:31:46 +0800
Subject: [PATCH 4/4] upgrade tispark version to 1.0.1

---
 tispark/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tispark/Dockerfile b/tispark/Dockerfile
index 62c6758..8271564 100644
--- a/tispark/Dockerfile
+++ b/tispark/Dockerfile
@@ -2,7 +2,7 @@ FROM anapsix/alpine-java:8
 ENV SPARK_VERSION=2.1.1 \
     HADOOP_VERSION=2.7 \
-    TISPARK_VERSION=1.0 \
+    TISPARK_VERSION=1.0.1 \
     TISPARK_R_VERSION=1.1 \
     TISPARK_PYTHON_VERSION=1.0.1 \
     SPARK_HOME=/opt/spark \