Skip to main content

Lab: Simple Data Pipeline with HDInsight

Pre-requisites

  1. Azure account
  2. Gen2 Storage in azure
  3. HDInsight Hadoop Cluster in azure
  4. SQL server in azure

SQL Queries

SQL Server Table

CREATE TABLE [dbo].[delays](
[origin_city_name] [nvarchar](50) NOT NULL,
[weather_delay] float,
CONSTRAINT [PK_delays] PRIMARY KEY CLUSTERED
([origin_city_name] ASC))
GO

SELECT * FROM information_schema.tables
GO

Hive Tables and Query

DROP TABLE flightdelay_raw;

CREATE EXTERNAL TABLE flightdelay_raw(
YEAR string,
FL_DATE string,
UNIQUE_CARRIER string,
CARRIER string,
FL_NUM string,
ORIGIN_AIRPORT_ID string,
ORIGIN string,
ORIGIN_CITY_NAME string,
ORIGIN_CITY_NAME_TEMP string,
ORIGIN_STATE_ABR string,
DEST_AIRPORT_ID string,
DEST string,
DEST_CITY_NAME string,
DEST_CITY_NAME_TEMP string,
DEST_STATE_ABR string,
DEP_DELAY_NEW float,
ARR_DELAY_NEW float,
CARRIER_DELAY float,
WEATHER_DELAY float,
NAS_DELAY float,
SECURITY_DELAY float,
LATE_AIRCRAFT_DELAY float)
-- The following lines describe the format and location of the file
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/demo/FlightDelayData/InputData';


DROP TABLE flightdelay;

CREATE TABLE flightdelay AS
SELECT YEAR AS year,
FL_DATE AS flight_date,
substring(UNIQUE_CARRIER, 2, length(UNIQUE_CARRIER) -1) AS unique_carrier,
substring(CARRIER, 2, length(CARRIER) -1) AS carrier,
substring(FL_NUM, 2, length(FL_NUM) -1) AS flight_num,
ORIGIN_AIRPORT_ID AS origin_airport_id,
substring(ORIGIN, 2, length(ORIGIN) -1) AS origin_airport_code,
substring(ORIGIN_CITY_NAME, 2) AS origin_city_name,
substring(ORIGIN_STATE_ABR, 2, length(ORIGIN_STATE_ABR) -1) AS origin_state_abr,
DEST_AIRPORT_ID AS dest_airport_id,
substring(DEST, 2, length(DEST) -1) AS dest_airport_code,
substring(DEST_CITY_NAME,2) AS dest_city_name,
substring(DEST_STATE_ABR, 2, length(DEST_STATE_ABR) -1) AS dest_state_abr,
DEP_DELAY_NEW AS dep_delay_new,
ARR_DELAY_NEW AS arr_delay_new,
CARRIER_DELAY AS carrier_delay,
WEATHER_DELAY AS weather_delay,
NAS_DELAY AS nas_delay,
SECURITY_DELAY AS security_delay,
LATE_AIRCRAFT_DELAY AS late_aircraft_delay
FROM flightdelay_raw;

Hive Query to save data back to Gen 2 storage

INSERT OVERWRITE DIRECTORY '/tutorials/flightdelays/output'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
SELECT regexp_replace(origin_city_name, '''', ''),
avg(weather_delay)
FROM delays
WHERE weather_delay IS NOT NULL
GROUP BY origin_city_name;

Sqoop commands

// SSH and Sqoop export
ssh eshantssh@clustername108-ssh.azurehdinsight.net

sqoop list-databases --connect jdbc:sqlserver://dbs108.database.windows.net:1433 --username eshant -P

sqoop export --connect 'jdbc:sqlserver://dbs108.database.windows.net:1433;database=db108' --username eshant -P --table 'delays' --export-dir '/demo/FlightDelayData/OutputData' --fields-terminated-by ',' -m 1