Pacific-Design.com

    
Home Index

1. Apache Spark

2. Join

Apache Spark / Join /

Spark Join

val spark: SparkSession = ...
spark.sql("select * from t1, t2 where t1.id = t2.id")

df1.join(df2, $"df1Key" === $"df2Key")
df1.join(df2).where($"df1Key" === $"df2Key")


val data1 = sc.textFile("table1.csv")
val rdd1 = data.map(line => line.split(","))
val table1 = rdd1.map( col => col(0), col(1) ).toDF()

val data2 = sc.textFile("table2.csv")
val rdd2 = data.map(line => line.split(","))
val table2 = rdd2.map( col => col(0), col(1) ).toDF()


val joinedDF = table1.join(
     table2, col("table1._1" === col("table2._2") ),  "inner"
)

joinedDF.saveParquet("hdfs://output")


import org.apache.spark._
import org.apache.spark.sql._
import org.apache.spark.sql.expressions._
import org.apache.spark.sql.functions._

object SparkSandbox extends App {

  case class Row(id: Int, value: String)

  private[this] implicit val spark = SparkSession.builder().master("local[*]").getOrCreate()
  import spark.implicits._
  spark.sparkContext.setLogLevel("ERROR")

  val r1 = Seq(Row(1, "A1"), Row(2, "A2"), Row(3, "A3"), Row(4, "A4")).toDS()
  val r2 = Seq(Row(3, "A3"), Row(4, "A4"), Row(4, "A4_1"), Row(5, "A5"), Row(6, "A6")).toDS()

  val joinTypes = Seq("inner", "outer", "full", "full_outer", "left"
                    , "left_outer", "right", "right_outer", "left_semi", "left_anti")

  joinTypes foreach {joinType =>
    println(s"${joinType.toUpperCase()} JOIN")
    r1.join(right = r2, usingColumns = Seq("id"), joinType = joinType).orderBy("id").show()
  }
}

Output

INNER JOIN +---+-----+-----+ | id|value|value| +---+-----+-----+ | 3| A3| A3| | 4| A4| A4_1| | 4| A4| A4| +---+-----+-----+ OUTER JOIN +---+-----+-----+ | id|value|value| +---+-----+-----+ | 1| A1| null| | 2| A2| null| | 3| A3| A3| | 4| A4| A4| | 4| A4| A4_1| | 5| null| A5| | 6| null| A6| +---+-----+-----+ FULL JOIN +---+-----+-----+ | id|value|value| +---+-----+-----+ | 1| A1| null| | 2| A2| null| | 3| A3| A3| | 4| A4| A4_1| | 4| A4| A4| | 5| null| A5| | 6| null| A6| +---+-----+-----+