Pacific-Design.com

    
Home Index

1. Apache Spark

2. Cassandra CQL

+ INSERT SELECT

+ JOIN TABLES

Apache Spark / Cassandra CQL /

Spark context available as sc.
Hive context available as hc.
CassandraSQLContext available as csc.

(GROUP BY) and (HAVING) — collect() returns Array[Row]


scala> csc.sql("SELECT b, count(*) FROM test1 GROUP BY b HAVING count(b) > 1").collect().foreach(println)
[1,4]
[2,4]

(GROUP BY) and (HAVING) — show() renders the DataFrame as a table


scala> csc.sql("SELECT b, count(*) FROM test1 GROUP BY b HAVING count(b) > 1").show()
+-+--+
|b|c1|
+-+--+
|1| 4|
|2| 4|
+-+--+


import org.apache.spark.sql.cassandra.CassandraSQLContext

val rdd = csc.sql("SELECT * from domain.table1")

rdd.take(10).foreach(println)


scala> csc.sql("SELECT * FROM domain.table1 LIMIT 100")
res5: org.apache.spark.sql.DataFrame = [root: string, count: bigint]

scala> res5.filter(res5("count") > 100).show()
+-------------------+-----+
|               root|count|
+-------------------+-----+
|        aptoide.com|  114|
|propertykhazana.com|  908|
+-------------------+-----+

UNION DISTINCT


csc.sql("SELECT test1.a FROM sql_test.test1 AS test1 UNION DISTINCT SELECT test2.a FROM sql_test.test2 AS test2").collect()
res10: Array[org.apache.spark.sql.Row] = Array([1], [2], [3])

INSERT SELECT into another table


csc.sql("INSERT INTO TABLE test3 SELECT a, b, c FROM test2").collect()
res30: Array[org.apache.spark.sql.Row] = Array([1,1,1], [1,2,3], [1,3,3], [2,1,3], [2,2,3], [2,3,3], [3,1,3], [3,2,3], [3,3,3])

Group By


scala> csc.sql("SELECT b, count(*) FROM test1 GROUP BY b").collect()
res16: Array[org.apache.spark.sql.Row] = Array([1,4], [2,4])

scala> res16.foreach(println)
[1,4]
[2,4]