Pacific-Design.com

    
Home Index

1. Apache Spark

2. RDD to DF

Apache Spark / RDD to DF /

RDD to DataFrame


import csc.implicits._

// Approach 1: convert an RDD of tuples directly with toDF.
// toDF is presumably supplied by the `csc.implicits._` import above -- confirm
// that `csc` is the SQL context in use.
val rdd = sc.parallelize(Seq( ("Alice in Wonderland", 2010), ... ))
// rdd: org.apache.spark.rdd.RDD[(String, Int)]

// The arguments to toDF name the columns, in tuple-field order.
val df = rdd.toDF("title", "year")
// df: org.apache.spark.sql.DataFrame = [title: string, year: int]


import org.apache.spark.sql.Row
import org.apache.spark.sql.types._

// Approach 2: build an RDD[Row] and attach an explicit schema.
// Each (title, year) tuple is wrapped in a generic Row.
val rdd = sc.parallelize(Array( ("Alice in Wonderland", 2010), ... ))
            .map { case (title, year) => Row(title, year) }
// rdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row]

// One StructField per column, in the same order as the Row values.
// nullable = true allows nulls in the column; nullable = false does not.
val schema = StructType(List(
  StructField("title", StringType,  nullable = false),
  StructField("year",  IntegerType, nullable = false)
))

// Pair the rows with the schema to obtain the DataFrame.
val df = csc.createDataFrame(rdd, schema)
// df: org.apache.spark.sql.DataFrame = [title: string, year: int]