Pacific-Design.com

    
Home Index

1. Apache Spark

2. Cassandra RDD

Apache Spark / Cassandra RDD /


// Read the "domain"."table1" Cassandra table as an RDD of (String, Int) pairs.
val table = sc.cassandraTable[(String, Int)]("domain", "table1")
/* table: com.datastax.spark.connector.rdd.CassandraTableScanRDD[(String, Int)] */

// Number of rows in the table.
table.count()

// Print up to ten rows.
table.take(10).foreach(println)

// Bring every row to the driver and sort by the Int field, largest first.
// A reversed Ordering is used instead of negating the key: negation overflows
// for Int.MinValue (-Int.MinValue == Int.MinValue), which would misplace that row.
// NOTE(review): collect() loads the whole table into driver memory — confirm the table is small enough.
val res0 = table.collect().toList.sortBy(_._2)(Ordering[Int].reverse)

// Named view of one (url, count) pair.
case class Row(url:String, count:Int)

// Convert each sorted tuple into a Row.
val res1 = res0.map { case (url, count) => Row(url, count) }

// Keep only rows whose count is above 10000.
val res2 = res1.filter(_.count > 10000)

// Print at most the first 100 remaining rows (still in descending count order).
res2.take(100).foreach(println)

Output

Row(academia.edu,27208)
Row(opendi.de,19394)
Row(opendi.at,18705)
Row(cbuch.de,18554)
Row(jiwu.com,17806)
Row(goodreads.com,15712)
Row(sterlingpublishers.com,15275)
Row(houzz.com,14781)
Row(lieju.com,14111)
Row(updatestar.com,12922)
Row(mobydiskrecords.es,11596)
Row(constructiondir.com,11011)
Row(vuodatus.net,10983)
Row(neoanuncios.com,10965)
Row(artistwebsites.com,10100)

Sum of a column (SQL SUM equivalent)


:showSchema

// Open "domain"."table1" without specifying a row type.
val table = sc.cassandraTable("domain", "table1")
// Project the "count" column, read each value as an Int, and add the values up.
val total = {
  val counts = table.select("count").as((n: Int) => n)
  counts.sum
}

Select columns with a where clause (SQL SELECT equivalent)



// Fetch the id and first_name columns of rows in "kevin"."user" whose id matches 1744.
// collect() replaces RDD.toArray, which is deprecated and was removed in Spark 2.0.
// NOTE(review): the id is bound as the string "1744" — confirm this matches the column's type.
val res0 = sc.cassandraTable("kevin", "user").select("id", "first_name").where("id = ?", "1744").collect()
// Print separately so res0 holds the fetched rows rather than the Unit returned by foreach.
res0.foreach(println)