Pacific-Design.com

    
Home Index

1. Apache Hive

2. Query Join

Apache Hive / Query Join /

Hive - Join Two Tables

DROP TABLE rank;
DROP TABLE visit;
DROP TABLE rank_visit_join;
SET mapred.reduce.tasks = 60; 

-- ------------------------------------------------------------------------- --
CREATE EXTERNAL TABLE rank (
    pageRank        INT,
    pageURL         STRING,
    avgDuration     INT 
) 
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' 
STORED AS TEXTFILE 
LOCATION 's3n://kduraj/data/rank';

-- ------------------------------------------------------------------------- --
CREATE EXTERNAL TABLE visit (
    source_ip       STRING,
    dest_url        STRING,
    visit_date      STRING,
    ad_revenue      DOUBLE,
    user_agent      STRING,
    country_code    STRING,
    language_code   STRING,
    search_word     STRING,
    duration        INT 
) 
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' 
STORED AS TEXTFILE 
LOCATION 's3n://kduraj/data/visit/';
 
-- ------------------------------------------------------------------------- --
CREATE TABLE rank_visit_join (
    source_ip       STRING,
    avg_page_rank   DOUBLE,
    total_revenue   DOUBLE
);

-- ------------------------------------------------------------------------- --
INSERT OVERWRITE TABLE rank_visit_join
SELECT source_ip, AVG(pageRank), SUM(ad_revenue) AS total_revenue
FROM rank R
  JOIN (
        SELECT source_ip, dest_url, ad_revenue
        FROM visit UV
        WHERE UV.visit_date > '1999-01-01'
        AND   UV.visit_date < '2000-01-01') NUV ON (R.pageURL = NUV.dest_url)
GROUP BY source_ip
ORDER BY total_revenue DESC LIMIT 10;