Параллельные и распределенные вычисления

Семинар 10

Hive

Пономаренко Роман
@rerand0m
rerandom@ispras.ru

debates
Курс по SQL
Книга "SQL за 10 минут"

Запуск оболочки Hive

$ hive -e 'SHOW DATABASES'
$ echo 'SHOW DATABASES' > sh_db.sql
$ hive -f sh_db.sql
$ hive
hive> SHOW DATABASES;
hive> !nproc;
hive> dfs -ls;

HUE

$ ssh <USERNAME>@mipt-client.atp-fivt.org -L 8888:mipt-node03.atp-fivt.org:8888

Заходить через localhost:8888

Логин: hue_user

Пароль: hue_password

Создание БД

create database <YOUR_USER>_test location '/user/<YOUR_USER>/test_metastore';

Удаление БД

drop database if exists <YOUR_USER>_test cascade;

Вывод информации о БД

describe database <YOUR_USER>_test;

Создание таблицы

ADD JAR /opt/cloudera/parcels/CDH/lib/hive/lib/hive-contrib.jar;
USE ${env:USER}_test;
DROP TABLE IF EXISTS Subnets;

-- EXTERNAL, MANAGED, TEMPORARY
CREATE EXTERNAL TABLE Subnets (
    ip STRING,
    mask STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY  '\t'
STORED AS TEXTFILE
LOCATION '/data/subnets/variant1';

Обращение

USE ${env:USER}_test;
SELECT * FROM Subnets LIMIT 10;

Партицирование

ADD JAR /opt/cloudera/parcels/CDH/lib/hive/lib/hive-contrib.jar;

SET hive.exec.dynamic.partition.mode=nonstrict;

USE ${env:USER}_test;
DROP TABLE IF EXISTS SubnetsPart;

CREATE EXTERNAL TABLE SubnetsPart (
ip STRING
)
PARTITIONED BY (mask STRING)
STORED AS TEXTFILE;

INSERT OVERWRITE TABLE SubnetsPart PARTITION (mask)
SELECT * FROM Subnets;

Парсинг входных данных

add jar /opt/cloudera/parcels/CDH/lib/hive/lib/hive-contrib.jar;
add jar /opt/cloudera/parcels/CDH/lib/hive/lib/hive-serde.jar;
USE ${env:USER}_test;
DROP TABLE IF EXISTS SerDeExample;

CREATE EXTERNAL TABLE SerDeExample (
ip STRING,
date STRING,
request STRING,
responseCode STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
"input.regex" = '^(\\S*)\\t.*$'
)
STORED AS TEXTFILE
LOCATION '/data/user_logs/user_logs_S';

select * from SerDeExample limit 10;
					
regex101

Hive streaming

ADD JAR /opt/cloudera/parcels/CDH/lib/hive/lib/hive-contrib.jar;

USE ${env:USER}_test;

SELECT TRANSFORM(ip)
USING 'cut -d . -f 1' AS ip
FROM Subnets
LIMIT 10;

Hive streaming

ADD JAR /opt/cloudera/parcels/CDH/lib/hive/lib/hive-contrib.jar;
ADD FILE ./script.sh;

USE ${env:USER}_test;

SELECT TRANSFORM(ip)
USING './script.sh' AS ip2
FROM Subnets
LIMIT 10;