From a49ea784639bcedaee61172918627bc4ea2389c7 Mon Sep 17 00:00:00 2001
From: Satya Kommula
Date: Mon, 17 Jan 2022 18:35:47 +0530
Subject: [PATCH 1/4] bump spark version "3.0.0" -> "3.2.0"

---
 build.sbt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build.sbt b/build.sbt
index 2303e62d..68e31d3c 100644
--- a/build.sbt
+++ b/build.sbt
@@ -14,7 +14,7 @@ sparkPackageName := "databricks/spark-sql-perf"
 // All Spark Packages need a license
 licenses := Seq("Apache-2.0" -> url("http://opensource.org/licenses/Apache-2.0"))
 
-sparkVersion := "3.0.0"
+sparkVersion := "3.2.0"
 
 sparkComponents ++= Seq("sql", "hive", "mllib")
 

From 229b5fff9078c8733d9301e58c5d71352904c2df Mon Sep 17 00:00:00 2001
From: Satya Kommula
Date: Tue, 3 Dec 2024 15:49:48 +0530
Subject: [PATCH 2/4] Upgrade Spark to version 3.5.1, update dependencies, and
 replace the Bintray URL

---
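Notes: Bintray, which used to host the sbt launcher jar, was shut down in
2021, so acquire_sbt_jar now downloads the official sbt distribution zip from
GitHub releases and copies sbt/bin/sbt-launch.jar out of it. On the Scala
side, a new `iteration: Int = 1` parameter is threaded through every
`doBenchmark` override; the default keeps existing call sites
source-compatible. Note that the forked path (`runBenchmarkForked`) does not
forward `iteration` in this patch, so the value only reaches `doBenchmark`
when `forkThread = false`.

A minimal sketch of how a caller might drive the new parameter (the loop,
`numIterations`, and `query` are illustrative, not part of this patch):

    import scala.collection.mutable.ArrayBuffer

    // Hypothetical runner: hand each 1-based pass number to benchmark(),
    // which forwards it to doBenchmark() on the non-forked path.
    val messages = new ArrayBuffer[String]
    val results = (1 to numIterations).map { i =>
      query.benchmark(
        includeBreakdown = false,
        description = s"iteration $i",
        messages = messages,
        timeout = 0L,
        forkThread = false,
        iteration = i)
    }
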
 bin/run                                       |  2 +-
 build.sbt                                     |  6 +++---
 build/sbt-launch-lib.bash                     | 17 +++++++++--------
 project/plugins.sbt                           | 19 +++++++++++--------
 .../databricks/spark/sql/perf/Benchmark.scala |  3 ++-
 .../spark/sql/perf/Benchmarkable.scala        |  8 +++++---
 .../com/databricks/spark/sql/perf/Query.scala |  3 ++-
 .../mllib/MLPipelineStageBenchmarkable.scala  |  3 ++-
 version.sbt                                   |  2 +-
 9 files changed, 36 insertions(+), 27 deletions(-)

diff --git a/bin/run b/bin/run
index 7d28227c..f8923ffc 100755
--- a/bin/run
+++ b/bin/run
@@ -3,4 +3,4 @@
 # runs spark-sql-perf from the current directory
 
 ARGS="runBenchmark $@"
-build/sbt "$ARGS"
\ No newline at end of file
+sbt "$ARGS"
\ No newline at end of file

diff --git a/build.sbt b/build.sbt
index 68e31d3c..1a2b09f2 100644
--- a/build.sbt
+++ b/build.sbt
@@ -5,16 +5,16 @@ name := "spark-sql-perf"
 
 organization := "com.databricks"
 
-scalaVersion := "2.12.10"
+scalaVersion := "2.12.18"
 
-crossScalaVersions := Seq("2.12.10")
+crossScalaVersions := Seq("2.12.18")
 
 sparkPackageName := "databricks/spark-sql-perf"
 
 // All Spark Packages need a license
 licenses := Seq("Apache-2.0" -> url("http://opensource.org/licenses/Apache-2.0"))
 
-sparkVersion := "3.2.0"
+sparkVersion := "3.5.1"
 
 sparkComponents ++= Seq("sql", "hive", "mllib")
 

diff --git a/build/sbt-launch-lib.bash b/build/sbt-launch-lib.bash
index 2a399365..707f70ef 100755
--- a/build/sbt-launch-lib.bash
+++ b/build/sbt-launch-lib.bash
@@ -45,9 +45,8 @@ dlog () {
 
 acquire_sbt_jar () {
   SBT_VERSION=`awk -F "=" '/sbt\.version/ {print $2}' ./project/build.properties`
-  URL1=https://dl.bintray.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
+  URL1=https://github.com/sbt/sbt/releases/download/v${SBT_VERSION}/sbt-${SBT_VERSION}.zip
   JAR=build/sbt-launch-${SBT_VERSION}.jar
-
   sbt_jar=$JAR
 
   if [[ ! -f "$sbt_jar" ]]; then
@@ -55,13 +54,15 @@ acquire_sbt_jar () {
     if [ ! -f "${JAR}" ]; then
       # Download
       printf "Attempting to fetch sbt\n"
-      JAR_DL="${JAR}.part"
+      COMPLETE_SBT="build/sbt.zip"
       if [ $(command -v curl) ]; then
-        curl --fail --location --silent ${URL1} > "${JAR_DL}" &&\
-        mv "${JAR_DL}" "${JAR}"
+        curl --fail --location --silent ${URL1} > "${COMPLETE_SBT}" &&\
+        unzip ${COMPLETE_SBT} &&\
+        cp "sbt/bin/sbt-launch.jar" "${JAR}"
       elif [ $(command -v wget) ]; then
-        wget --quiet ${URL1} -O "${JAR_DL}" &&\
-        mv "${JAR_DL}" "${JAR}"
+        wget --quiet ${URL1} -O "${COMPLETE_SBT}" &&\
+        unzip ${COMPLETE_SBT} &&\
+        cp "sbt/bin/sbt-launch.jar" "${JAR}"
       else
         printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n"
         exit -1
@@ -195,4 +196,4 @@ run() {
     -jar "$sbt_jar" \
     "${sbt_commands[@]}" \
     "${residual_args[@]}"
-}
+}
\ No newline at end of file

diff --git a/project/plugins.sbt b/project/plugins.sbt
index d2473b61..c76851f6 100644
--- a/project/plugins.sbt
+++ b/project/plugins.sbt
@@ -1,17 +1,20 @@
 // You may use this file to add plugin dependencies for sbt.
 
-resolvers += "Spark Packages repo" at "https://repos.spark-packages.org/"
+resolvers ++= Seq(
+  Resolver.mavenLocal,
+  Resolver.sonatypeRepo("releases"),
+  "Maven Central" at "https://repo1.maven.org/maven2/",
+  "Spark Packages Repo" at "https://repos.spark-packages.org/"
+)
 
-resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/"
-
-addSbtPlugin("org.spark-packages" %% "sbt-spark-package" % "0.1.1")
+addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.3")
 
 addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0")
 
-addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.0")
+addSbtPlugin("com.github.sbt" % "sbt-release" % "1.0.15")
 
-addSbtPlugin("com.databricks" %% "sbt-databricks" % "0.1.3")
+addSbtPlugin("com.databricks" %% "sbt-databricks" % "0.1.5")
 
-addSbtPlugin("me.lessis" % "bintray-sbt" % "0.3.0")
+addSbtPlugin("org.foundweekends" % "sbt-bintray" % "0.5.6")
 
-addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.0.0")
+addSbtPlugin("com.github.sbt" % "sbt-pgp" % "2.1.2")

diff --git a/src/main/scala/com/databricks/spark/sql/perf/Benchmark.scala b/src/main/scala/com/databricks/spark/sql/perf/Benchmark.scala
index ebb49353..6098f353 100644
--- a/src/main/scala/com/databricks/spark/sql/perf/Benchmark.scala
+++ b/src/main/scala/com/databricks/spark/sql/perf/Benchmark.scala
@@ -240,7 +240,8 @@ abstract class Benchmark(
   protected override def doBenchmark(
       includeBreakdown: Boolean,
       description: String = "",
-      messages: ArrayBuffer[String]): BenchmarkResult = {
+      messages: ArrayBuffer[String],
+      iteration: Int = 1): BenchmarkResult = {
     try {
       val timeMs = measureTimeMs(run())
       BenchmarkResult(

diff --git a/src/main/scala/com/databricks/spark/sql/perf/Benchmarkable.scala b/src/main/scala/com/databricks/spark/sql/perf/Benchmarkable.scala
index 24efef70..b36850fc 100644
--- a/src/main/scala/com/databricks/spark/sql/perf/Benchmarkable.scala
+++ b/src/main/scala/com/databricks/spark/sql/perf/Benchmarkable.scala
@@ -43,14 +43,15 @@ trait Benchmarkable {
       description: String = "",
       messages: ArrayBuffer[String],
       timeout: Long,
-      forkThread: Boolean = true): BenchmarkResult = {
+      forkThread: Boolean = true,
+      iteration: Int = 1): BenchmarkResult = {
     logger.info(s"$this: benchmark")
     sparkContext.setJobDescription(s"Execution: $name, $description")
     beforeBenchmark()
     val result = if (forkThread) {
       runBenchmarkForked(includeBreakdown, description, messages, timeout)
     } else {
-      doBenchmark(includeBreakdown, description, messages)
+      doBenchmark(includeBreakdown, description, messages, iteration)
     }
     afterBenchmark(sqlContext.sparkContext)
     result
@@ -107,7 +108,8 @@ trait Benchmarkable {
   protected def doBenchmark(
       includeBreakdown: Boolean,
       description: String = "",
-      messages: ArrayBuffer[String]): BenchmarkResult
+      messages: ArrayBuffer[String],
+      iteration: Int = 1): BenchmarkResult
 
   protected def measureTimeMs[A](f: => A): Double = {
     val startTime = System.nanoTime()

diff --git a/src/main/scala/com/databricks/spark/sql/perf/Query.scala b/src/main/scala/com/databricks/spark/sql/perf/Query.scala
index babc63f0..48c0e880 100644
--- a/src/main/scala/com/databricks/spark/sql/perf/Query.scala
+++ b/src/main/scala/com/databricks/spark/sql/perf/Query.scala
@@ -62,7 +62,8 @@ class Query(
   protected override def doBenchmark(
       includeBreakdown: Boolean,
       description: String = "",
-      messages: ArrayBuffer[String]): BenchmarkResult = {
+      messages: ArrayBuffer[String],
+      iteration: Int = 1): BenchmarkResult = {
     try {
       val dataFrame = buildDataFrame
       val queryExecution = dataFrame.queryExecution

diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/MLPipelineStageBenchmarkable.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/MLPipelineStageBenchmarkable.scala
index 8296f46b..58b58919 100644
--- a/src/main/scala/com/databricks/spark/sql/perf/mllib/MLPipelineStageBenchmarkable.scala
+++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/MLPipelineStageBenchmarkable.scala
@@ -45,7 +45,8 @@ class MLPipelineStageBenchmarkable(
   override protected def doBenchmark(
       includeBreakdown: Boolean,
       description: String,
-      messages: ArrayBuffer[String]): BenchmarkResult = {
+      messages: ArrayBuffer[String],
+      iteration: Int = 1): BenchmarkResult = {
     try {
       val (trainingTime, model: Transformer) = measureTime {
         logger.info(s"$this: train: trainingSet=${trainingData.schema}")

diff --git a/version.sbt b/version.sbt
index 7338ce76..f9436171 100644
--- a/version.sbt
+++ b/version.sbt
@@ -1 +1 @@
-version in ThisBuild := "0.5.1-SNAPSHOT"
+version in ThisBuild := "0.5.2-SNAPSHOT"

From 9b8d6531dab1044148421d78d733488a33c3064d Mon Sep 17 00:00:00 2001
From: Satya Kommula
Date: Tue, 3 Dec 2024 17:24:42 +0530
Subject: [PATCH 3/4] initial commit for github workflows

---
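Notes: the workflow compiles and packages the project with JDK 11 on every
push and pull request to master, caches the Ivy, sbt, and Coursier
directories keyed on build.sbt, greps the version out of version.sbt into
$GITHUB_ENV, and uploads the built jar as an artifact. One caveat: the sbt
installed through SDKMAN is only on the PATH of that single run step's
shell, so the later `sbt compile` and `sbt package` steps resolve to
whatever sbt the runner image happens to provide; the next patch drops this
step in favor of the checked-in build/sbt launcher.
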
 .github/workflows/scala.yml | 58 +++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 .github/workflows/scala.yml

diff --git a/.github/workflows/scala.yml b/.github/workflows/scala.yml
new file mode 100644
index 00000000..c1d767bc
--- /dev/null
+++ b/.github/workflows/scala.yml
@@ -0,0 +1,58 @@
+name: Build Spark sql perf
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  build:
+    runs-on: ubuntu-22.04
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v2
+
+      - name: Set up JDK 11
+        uses: actions/setup-java@v3
+        with:
+          java-version: '11'
+          distribution: 'adopt'
+
+      - name: Install SDKMAN! and sbt 0.13.18
+        run: |
+          curl -s "https://get.sdkman.io" | bash
+          source "$HOME/.sdkman/bin/sdkman-init.sh"
+          sdk install sbt 0.13.18
+
+      - name: Cache sbt
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.ivy2/cache
+            ~/.sbt
+            ~/.coursier
+          key: ${{ runner.os }}-sbt-${{ hashFiles('**/build.sbt') }}
+          restore-keys: |
+            ${{ runner.os }}-sbt-
+
+      - name: Build with sbt
+        run: sbt compile
+
+      - name: Package with sbt
+        run: sbt package
+
+      - name: Extract version
+        id: extract_version
+        run: |
+          version=$(cat version.sbt | grep 'version in ThisBuild :=' | awk -F'\"' '{print $2}')
+          echo "version=$version" >> $GITHUB_ENV
+
+      - name: Upload JAR artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: spark-sql-perf_2.12-${{ env.version }}.jar
+          path: target/scala-2.12/*.jar
\ No newline at end of file

From dcfeaf1a18c0bb294c324d2d58bedaa4d4675043 Mon Sep 17 00:00:00 2001
From: Satya Kommula
Date: Tue, 25 Nov 2025 13:20:47 +0530
Subject: [PATCH 4/4] Update build configuration and dependencies; remove
 incompatible plugins and Databricks settings (#3)

---
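Notes: moving from sbt 0.13.18 to 1.10.6 retires the `key in Scope` syntax
in favor of slash syntax and drops plugins that were never published for sbt
1.x (sbt-spark-package among them). With sbt-spark-package gone, the
`sparkVersion`/`sparkComponents` keys no longer exist, so the Spark modules
are declared directly as "provided" dependencies. A small build.sbt sketch
of the syntax migration (illustrative; the `demo` task is not part of this
patch):

    // sbt 1.x slash syntax; the sbt 0.13 spellings are shown in comments.
    ThisBuild / version := "0.5.2-SNAPSHOT"  // was: version in ThisBuild := ...
    console / initialCommands := "import org.apache.spark.sql._"
                                             // was: initialCommands in console := ...
    val demo = taskKey[Unit]("prints the compile classpath")
    demo := {
      // was: (fullClasspath in Compile).value
      val cp = (Compile / fullClasspath).value
      println(cp.map(_.data).mkString(java.io.File.pathSeparator))
    }

One caution: slash syntax puts the scope before the key, so
`initialCommands in console` canonically becomes `console / initialCommands`;
the diff below spells it `initialCommands / console`, which reads scope and
key in the opposite order.
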
 .github/workflows/scala.yml                   | 10 +--
 .gitignore                                    |  2 +
 build.sbt                                     | 62 ++++++++++---------
 build/sbt                                     |  2 +-
 project/build.properties                      |  3 +-
 project/plugins.sbt                           | 17 +++--
 .../databricks/spark/sql/perf/Benchmark.scala |  2 +-
 .../spark/sql/perf/Benchmarkable.scala        |  2 +-
 .../spark/sql/perf/DatasetPerformance.scala   |  2 +-
 .../com/databricks/spark/sql/perf/Query.scala |  2 +-
 version.sbt                                   |  2 +-
 11 files changed, 51 insertions(+), 55 deletions(-)
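A note on the Scala source changes below: `() => Unit` does not denote a
no-op function; it is a function whose body is the `Unit` companion object
(the result is then value-discarded), which newer compilers warn about. The
literal `() => ()` returns the unit value itself. A short illustration:

    // Both have type () => Unit, but the first returns the Unit companion
    // object and relies on value discarding; the second is the real no-op.
    val suspicious: () => Unit = () => Unit
    val noOp: () => Unit = () => ()

In the same spirit, the deprecated Scala enrichment `getStackTraceString`
(removed in newer Scala versions) is replaced with the plain JDK equivalent
`e.getStackTrace.mkString("\n")`.
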
diff --git a/.github/workflows/scala.yml b/.github/workflows/scala.yml
index c1d767bc..cb480db0 100644
--- a/.github/workflows/scala.yml
+++ b/.github/workflows/scala.yml
@@ -22,12 +22,6 @@ jobs:
           java-version: '11'
           distribution: 'adopt'
 
-      - name: Install SDKMAN! and sbt 0.13.18
-        run: |
-          curl -s "https://get.sdkman.io" | bash
-          source "$HOME/.sdkman/bin/sdkman-init.sh"
-          sdk install sbt 0.13.18
-
       - name: Cache sbt
         uses: actions/cache@v4
         with:
@@ -40,10 +34,10 @@ jobs:
             ${{ runner.os }}-sbt-
 
       - name: Build with sbt
-        run: sbt compile
+        run: ./build/sbt compile
 
       - name: Package with sbt
-        run: sbt package
+        run: ./build/sbt package
 
       - name: Extract version
         id: extract_version

diff --git a/.gitignore b/.gitignore
index 1bcb62a0..fec77466 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,3 +19,5 @@ src_managed/
 project/boot/
 project/plugins/project/
 performance/
+/.bloop/
+/build/*.zip

diff --git a/build.sbt b/build.sbt
index 1a2b09f2..ba97caa9 100644
--- a/build.sbt
+++ b/build.sbt
@@ -9,61 +9,60 @@ scalaVersion := "2.12.18"
 
 crossScalaVersions := Seq("2.12.18")
 
-sparkPackageName := "databricks/spark-sql-perf"
+// Remove publishing configuration for now - focus on compilation
+// sparkPackageName := "databricks/spark-sql-perf"
 
 // All Spark Packages need a license
 licenses := Seq("Apache-2.0" -> url("http://opensource.org/licenses/Apache-2.0"))
 
-sparkVersion := "3.5.1"
+// Spark version - define it manually since we removed the spark-packages plugin
+val sparkVersion = "3.5.1"
 
-sparkComponents ++= Seq("sql", "hive", "mllib")
+// Add Spark dependencies manually
+libraryDependencies ++= Seq(
+  "org.apache.spark" %% "spark-core" % sparkVersion % "provided",
+  "org.apache.spark" %% "spark-sql" % sparkVersion % "provided",
+  "org.apache.spark" %% "spark-hive" % sparkVersion % "provided",
+  "org.apache.spark" %% "spark-mllib" % sparkVersion % "provided"
+)
 
-initialCommands in console :=
+initialCommands / console :=
   """
     |import org.apache.spark.sql._
    |import org.apache.spark.sql.functions._
    |import org.apache.spark.sql.types._
-    |import org.apache.spark.sql.hive.test.TestHive
-    |import TestHive.implicits
-    |import TestHive.sql
+    |import org.apache.spark.sql.SparkSession
     |
-    |val sqlContext = TestHive
+    |val spark = SparkSession.builder().appName("spark-sql-perf").getOrCreate()
+    |val sqlContext = spark.sqlContext
    |import sqlContext.implicits._
  """.stripMargin
 
-libraryDependencies += "com.github.scopt" %% "scopt" % "3.7.1"
+libraryDependencies += "com.github.scopt" %% "scopt" % "4.1.0"
 
-libraryDependencies += "com.twitter" %% "util-jvm" % "6.45.0" % "provided"
+libraryDependencies += "com.twitter" %% "util-jvm" % "24.2.0" % "provided"
 
-libraryDependencies += "org.scalatest" %% "scalatest" % "3.0.5" % "test"
+libraryDependencies += "org.scalatest" %% "scalatest" % "3.2.19" % "test"
 
-libraryDependencies += "org.yaml" % "snakeyaml" % "1.23"
+libraryDependencies += "org.yaml" % "snakeyaml" % "2.5"
 
 fork := true
 
-// Your username to login to Databricks Cloud
-dbcUsername := sys.env.getOrElse("DBC_USERNAME", "")
-
-// Your password (Can be set as an environment variable)
-dbcPassword := sys.env.getOrElse("DBC_PASSWORD", "")
-
-// The URL to the Databricks Cloud DB Api. Don't forget to set the port number to 34563!
-dbcApiUrl := sys.env.getOrElse ("DBC_URL", sys.error("Please set DBC_URL"))
-
-// Add any clusters that you would like to deploy your work to. e.g. "My Cluster"
-// or run dbcExecuteCommand
-dbcClusters += sys.env.getOrElse("DBC_USERNAME", "")
-
-dbcLibraryPath := s"/Users/${sys.env.getOrElse("DBC_USERNAME", "")}/lib"
+// Remove Databricks Cloud configuration for now
+// dbcUsername := sys.env.getOrElse("DBC_USERNAME", "")
+// dbcPassword := sys.env.getOrElse("DBC_PASSWORD", "")
+// dbcApiUrl := sys.env.getOrElse ("DBC_URL", sys.error("Please set DBC_URL"))
+// dbcClusters += sys.env.getOrElse("DBC_USERNAME", "")
+// dbcLibraryPath := s"/Users/${sys.env.getOrElse("DBC_USERNAME", "")}/lib"
 
 val runBenchmark = inputKey[Unit]("runs a benchmark")
 
 runBenchmark := {
   import complete.DefaultParsers._
   val args = spaceDelimited("[args]").parsed
-  val scalaRun = (runner in run).value
-  val classpath = (fullClasspath in Compile).value
+  val scalaRun = (Compile / run / runner).value
+  val classpath = (Compile / fullClasspath).value
   scalaRun.run("com.databricks.spark.sql.perf.RunBenchmark", classpath.map(_.data),
     args, streams.value.log)
 }
@@ -74,13 +73,15 @@ runBenchmark := {
 
 val runMLBenchmark = inputKey[Unit]("runs an ML benchmark")
 
 runMLBenchmark := {
   import complete.DefaultParsers._
   val args = spaceDelimited("[args]").parsed
-  val scalaRun = (runner in run).value
-  val classpath = (fullClasspath in Compile).value
+  val scalaRun = (Compile / run / runner).value
+  val classpath = (Compile / fullClasspath).value
   scalaRun.run("com.databricks.spark.sql.perf.mllib.MLLib", classpath.map(_.data),
     args, streams.value.log)
 }
+// Comment out release configuration for now
+/*
 import ReleaseTransformations._
 /** Push to the team directory instead of the user's homedir for releases. */
@@ -159,3 +160,4 @@ releaseProcess := Seq[ReleaseStep](
   commitNextVersion,
   pushChanges
 )
+*/
\ No newline at end of file

diff --git a/build/sbt b/build/sbt
index cc3203d7..7d26b548 100755
--- a/build/sbt
+++ b/build/sbt
@@ -153,4 +153,4 @@ trap onExit INT
 run "$@"
 
 exit_status=$?
-onExit
+onExit
\ No newline at end of file

diff --git a/project/build.properties b/project/build.properties
index 5c4bcd91..e88a0d81 100644
--- a/project/build.properties
+++ b/project/build.properties
@@ -1,2 +1 @@
-// This file should only contain the version of sbt to use.
-sbt.version=0.13.18
+sbt.version=1.10.6

diff --git a/project/plugins.sbt b/project/plugins.sbt
index c76851f6..1b633aee 100644
--- a/project/plugins.sbt
+++ b/project/plugins.sbt
@@ -1,20 +1,19 @@
 // You may use this file to add plugin dependencies for sbt.
 
 resolvers ++= Seq(
-  Resolver.mavenLocal,
-  Resolver.sonatypeRepo("releases"),
-  "Maven Central" at "https://repo1.maven.org/maven2/",
+  "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/",
   "Spark Packages Repo" at "https://repos.spark-packages.org/"
 )
 
-addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.3")
+// Remove incompatible plugins for now
+// addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.3")
 
-addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0")
+// addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0")
 
-addSbtPlugin("com.github.sbt" % "sbt-release" % "1.0.15")
+// addSbtPlugin("com.github.sbt" % "sbt-release" % "1.0.15")
 
-addSbtPlugin("com.databricks" %% "sbt-databricks" % "0.1.5")
+// addSbtPlugin("com.databricks" %% "sbt-databricks" % "0.1.5")
 
-addSbtPlugin("org.foundweekends" % "sbt-bintray" % "0.5.6")
+// addSbtPlugin("org.foundweekends" % "sbt-bintray" % "0.5.6")
 
-addSbtPlugin("com.github.sbt" % "sbt-pgp" % "2.1.2")
+// addSbtPlugin("com.github.sbt" % "sbt-pgp" % "2.1.2")

diff --git a/src/main/scala/com/databricks/spark/sql/perf/Benchmark.scala b/src/main/scala/com/databricks/spark/sql/perf/Benchmark.scala
index 6098f353..e214dff1 100644
--- a/src/main/scala/com/databricks/spark/sql/perf/Benchmark.scala
+++ b/src/main/scala/com/databricks/spark/sql/perf/Benchmark.scala
@@ -212,7 +212,7 @@ abstract class Benchmark(
     new SparkPerfExecution(
       name,
       Map.empty,
-      () => Unit,
+      () => (),
       () => rdd.count(),
       rdd.toDebugString)
   }

diff --git a/src/main/scala/com/databricks/spark/sql/perf/Benchmarkable.scala b/src/main/scala/com/databricks/spark/sql/perf/Benchmarkable.scala
index b36850fc..6acb520a 100644
--- a/src/main/scala/com/databricks/spark/sql/perf/Benchmarkable.scala
+++ b/src/main/scala/com/databricks/spark/sql/perf/Benchmarkable.scala
@@ -86,7 +86,7 @@ trait Benchmarkable {
           mode = executionMode.toString,
           parameters = Map.empty,
           failure = Some(Failure(e.getClass.getSimpleName,
-            e.getMessage + ":\n" + e.getStackTraceString)))
+            e.getMessage + ":\n" + e.getStackTrace.mkString("\n"))))
       }
     }
   }

diff --git a/src/main/scala/com/databricks/spark/sql/perf/DatasetPerformance.scala b/src/main/scala/com/databricks/spark/sql/perf/DatasetPerformance.scala
index 0aaa6296..b3d25d44 100644
--- a/src/main/scala/com/databricks/spark/sql/perf/DatasetPerformance.scala
+++ b/src/main/scala/com/databricks/spark/sql/perf/DatasetPerformance.scala
@@ -133,7 +133,7 @@ class DatasetPerformance extends Benchmark {
     new SparkPerfExecution(
       "RDD: average",
       Map.empty,
-      prepare = () => Unit,
+      prepare = () => (),
       run = () => {
         val sumAndCount =
           smallrdd.map(i => (i, 1)).reduce((a, b) => (a._1 + b._1, a._2 + b._2))

diff --git a/src/main/scala/com/databricks/spark/sql/perf/Query.scala b/src/main/scala/com/databricks/spark/sql/perf/Query.scala
index 48c0e880..c694225e 100644
--- a/src/main/scala/com/databricks/spark/sql/perf/Query.scala
+++ b/src/main/scala/com/databricks/spark/sql/perf/Query.scala
@@ -93,7 +93,7 @@ class Query(
         messages += s"Breakdown: ${node.simpleString(maxFields)}"
         val newNode = buildDataFrame.queryExecution.executedPlan.p(index)
         val executionTime = measureTimeMs {
-          newNode.execute().foreach((row: Any) => Unit)
+          newNode.execute().foreach((row: Any) => ())
         }
 
         timeMap += ((index, executionTime))

diff --git a/version.sbt b/version.sbt
index f9436171..f13c2095 100644
--- a/version.sbt
+++ b/version.sbt
@@ -1 +1 @@
-version in ThisBuild := "0.5.2-SNAPSHOT"
+ThisBuild / version := "0.5.2-SNAPSHOT"