1. Create a table with complex types
Sample data (test.txt):
songsong,bingbing_lili,xiao song:18_xiaoxiao song:19,hui long guan_beijing
yangyang,caicai_susu,xiao yang:18_xiaoxiao yang:19,chao yang_beijing
create table if not exists test( name string, friends array<string>, children map<string, int>, address struct<street:string, city:string> ) row format delimited fields terminated by ',' collection items terminated by '_' map keys terminated by ':' lines terminated by '\n';
row format delimited fields terminated by ',' -- column separator
collection items terminated by '_' -- separator between elements of a MAP, ARRAY, or STRUCT (splits the collection data)
map keys terminated by ':' -- separator between key and value in a MAP
lines terminated by '\n'; -- line separator
2. Load data
load data local inpath '/opt/module/hive/datas/test.txt' into table test;
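To verify that the complex types were parsed, a query along these lines (using the sample data above) reads an array element, a map value, and a struct field:
select friends[1], children['xiao song'], address.city from test where name = 'songsong';
Expected result: lili, 18, beijing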
3. Create database
create database if not exists db_hive;
4. Show
show databases;
5. Switch database
use db_hive;
6. Create a table based on the query results (the query results are added to the newly created table)
create table if not exists student2 as select id, name from student;
7. Create a table based on an existing table structure
create table if not exists student3 like student;
8. Create external table
Because the table is external, Hive does not assume it fully owns the data. Dropping the table does not delete the underlying data; only the metadata describing the table is removed.
create external table if not exists dept( deptno int, dname string, loc int ) row format delimited fields terminated by '\t';
9. Modify the internal table student2 to an external table
alter table student2 set tblproperties('EXTERNAL'='TRUE');
Setting 'EXTERNAL'='FALSE' changes an external table back to an internal one, as shown below.
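A sketch of the reverse direction (note that the values TRUE/FALSE are conventionally written in uppercase):
alter table student2 set tblproperties('EXTERNAL'='FALSE');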
10. Add columns
alter table dept add columns(deptdesc string);
11. Update column
alter table dept change column deptdesc desc string;
12. Replace columns (the whole column list is replaced; columns not listed are dropped)
alter table dept replace columns(deptno string, dname string, loc string);
13. Delete table
drop table dept;
14. Insert data
1. Basic insert with values
insert into table student_par values(1,'wangwu'),(2,'zhaoliu');
2. Basic mode insertion (based on the query results of a single table)
insert overwrite table student_par select id, name from student where month='201709';
3. Multi-table (multi-partition) insert mode (several inserts driven by one query)
from student insert overwrite table student partition(month='201707') select id, name where month='201709' insert overwrite table student partition(month='201706') select id, name where month='201709';
15. Query data
select * from student5;
16. Import data into the specified Hive table
import table student2 from '/user/hive/warehouse/export/student';
Note: export first, and then import the data.
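For example, the directory used above would first be produced by an export such as (the same statement appears in section 17):
export table default.student to '/user/hive/warehouse/export/student';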
17. Data export
1. Insert export
Export query results to local
insert overwrite local directory '/opt/module/hive/data/export/student' select * from student;
Format and export query results to local
insert overwrite local directory '/opt/module/hive/data/export/student1' ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' select * from student;
Export the query results to HDFS (no local keyword)
insert overwrite directory '/user/atguigu/student2' ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' select * from student;
2. Export to local with a Hadoop command
dfs -get /user/hive/warehouse/student/student.txt /opt/module/data/export/student3.txt;
3. Hive Shell command export
bin/hive -e 'select * from default.student;' > /opt/module/hive/data/export/student4.txt;
4. Export to HDFS
export and import are mainly used for Hive table migration between two Hadoop platform clusters.
export table default.student to '/user/hive/warehouse/export/student';
5. Clear the data in a table (Truncate; only works on managed tables)
truncate table student;
18. Basic query (Select... From)
1. Column alias
Renames a column, which is convenient for calculated expressions.
The alias immediately follows the column name; the keyword AS may optionally be placed between the column name and the alias.
Example:
select ename AS name, deptno dn from emp;
2. Common functions
count — total number of rows
max — maximum value
min — minimum value
sum — total
avg — average value
select count(*) cnt from emp; select max(sal) max_sal from emp;
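The remaining functions in the list work the same way, e.g.:
select min(sal) min_sal from emp; select sum(sal) sum_sal from emp; select avg(sal) avg_sal from emp;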
3. Limit statement
A typical query returns many rows; the LIMIT clause restricts the number of rows returned.
select * from emp limit 5;
4. Where statement
Use the WHERE clause to filter out rows that do not meet the conditions
The WHERE clause follows the FROM clause
select * from emp where sal >1000;
Note: field aliases cannot be used in the where clause
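A minimal illustration (hypothetical query):
select ename, sal s from emp where s > 1000;
This fails because the alias s is not yet visible when WHERE is evaluated; write where sal > 1000 instead.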
5. Comparison operator (Between/In/ Is Null)
select * from emp where sal =5000;
Equal to 5000
select * from emp where sal between 500 and 1000;
500 to 1000
select * from emp where comm is null;
comm is null
select * from emp where sal IN (1500, 5000);
1500 or 5000
6. Like and RLike
Beginning with A
select * from emp where ename LIKE 'A%';
The second character is A
select * from emp where ename LIKE '_A%';
Contains an A (RLIKE matches with Java regular expressions)
select * from emp where ename RLIKE '[A]';
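Because RLIKE takes Java regular expressions, the earlier LIKE patterns can also be written as regexes, e.g. beginning with A:
select * from emp where ename RLIKE '^A';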
7. Logical operators (And/Or/Not)
select * from emp where sal>1000 and deptno=30; select * from emp where sal>1000 or deptno=30; select * from emp where deptno not IN(30, 20);
19. Grouping
1. Group By statement
GROUP BY is usually used together with aggregate functions: rows are grouped by one or more columns, and an aggregate is then computed for each group.
select t.deptno, avg(t.sal) avg_sal from emp t group by t.deptno;
Calculate the average salary of each department in emp table
select t.deptno, t.job, max(t.sal) max_sal from emp t group by t.deptno, t.job;
Calculate emp the maximum salary for each position in each department
2. Having statement
Aggregate functions cannot be used in a WHERE clause, but they can be used after HAVING.
HAVING is only used in statements that group with GROUP BY.
select deptno, avg(sal) from emp group by deptno;
Average salary of each department
select deptno, avg(sal) avg_sal from emp group by deptno having avg_sal > 2000;
Departments whose average salary exceeds 2000
20. Join
1. Equi-join
select e.empno, e.ename, d.deptno, d.dname from emp e join dept d on e.deptno = d.deptno;
Query employee number, employee name and department name according to the same department number in employee table and department table
2. Table alias
select e.empno, e.ename, d.deptno from emp e join dept d on e.deptno = d.deptno;
Join the employee table with the department table
3. Inner join
Inner join: only rows whose join key matches in both tables are kept.
select e.empno, e.ename, d.deptno from emp e join dept d on e.deptno = d.deptno;
4. Outer joins
Left outer join: all records from the table on the left of the JOIN operator that satisfy the WHERE clause are returned.
select e.empno, e.ename, d.deptno from emp e left join dept d on e.deptno = d.deptno;
Right outer join: all records from the table on the right of the JOIN operator that satisfy the WHERE clause are returned.
select e.empno, e.ename, d.deptno from emp e right join dept d on e.deptno = d.deptno;
Full outer join: all records from both tables that satisfy the WHERE clause are returned; where the specified field of either table has no matching value, NULL is used instead.
select e.empno, e.ename, d.deptno from emp e full join dept d on e.deptno = d.deptno;
5. Multi-table join
create table if not exists location( loc int, loc_name string ) row format delimited fields terminated by '\t';
load data local inpath '/opt/module/datas/location.txt' into table location;
SELECT e.ename, d.dname, l.loc_name FROM emp e JOIN dept d ON d.deptno = e.deptno JOIN location l ON d.loc = l.loc;
In most cases, Hive starts one MapReduce job for each pair of JOINed objects. In this example, one MapReduce job joins table e with table d, and a second MapReduce job then joins the output of the first job with table l.
Note: why not join tables d and l first? Because Hive always executes from left to right.
Optimization: when joining three or more tables, if every ON clause uses the same join key, only one MapReduce job is generated (see the sketch below).
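A contrived sketch of that rule (joining dept twice just to keep the key identical): both ON clauses use e.deptno, so Hive can compile the query into a single MapReduce job.
SELECT e.ename, d.dname FROM emp e JOIN dept d ON d.deptno = e.deptno JOIN dept d2 ON d2.deptno = e.deptno;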
21. Cartesian product
The Cartesian product will be generated under the following conditions:
(1) The join condition is omitted
(2) The join condition is invalid
(3) Every row of one table joins with every row of the other
select empno, dname from emp, dept;
22. Sorting
1. Order By
Order By: Global sort. There is only one Reducer
ASC (ascend): ascending (default)
DESC (descend): descending
select * from emp order by sal;
select * from emp order by sal desc;
2. Sort by alias
select ename, sal*2 twosal from emp order by twosal;
3. Sorting multiple columns
select ename, deptno, sal from emp order by deptno, sal;
4. Sort By within each Reducer
sort by: for large data sets, order by is very inefficient. In many cases a global sort is not required, and sort by can be used instead.
sort by produces one sorted file per reducer; each reducer's output is sorted internally, but the overall result set is not.
set mapreduce.job.reduces=3;
Set the number of reducers
select * from emp sort by deptno desc;
View employee information in descending order according to department number
insert overwrite local directory '/opt/module/data/sortby-result' select * from emp sort by deptno desc;
Import the query results into the file (sorted by department number in descending order)
5. Distribute By
distribute by: in some cases we need to control which reducer a particular row goes to, usually in preparation for a subsequent aggregation. The distribute by clause does this; it is similar to a custom partitioner in MapReduce and is used in combination with sort by. To test distribute by, you must allocate more than one reducer, otherwise its effect is not visible.
insert overwrite local directory '/opt/module/data/distribute-result' select * from emp distribute by deptno sort by empno desc;
First partition by department number, then sort by employee number in descending order within each partition.
6. Cluster By
When the distribute by and sort by fields are the same, cluster by can be used instead.
Besides the function of distribute by, cluster by also sorts like sort by. However, the sort is ascending only; ASC or DESC cannot be specified.
select * from emp cluster by deptno; select * from emp distribute by deptno sort by deptno;
23. Partition table
1. Create partition table syntax
Note: the partition field cannot be a column that already exists in the table; it can be regarded as a pseudo-column of the table.
create table dept_partition( deptno int, dname string, loc string ) partitioned by (day string) row format delimited fields terminated by '\t';
2. Load Partition
Note: the partition must be specified when the partition table loads data
load data local inpath '/opt/module/hive/datas/dept_20200401.log' into table dept_partition partition(day='20200401');
3. Query data in partition table
select * from dept_partition where day='20200401';
Single-partition query
select * from dept_partition where day='20200401' union select * from dept_partition where day='20200402' union select * from dept_partition where day='20200403';
select * from dept_partition where day='20200401' or day='20200402' or day='20200403';
Multi partition joint query
4. Add partitions
alter table dept_partition add partition(day='20200404');
alter table dept_partition add partition(day='20200405') partition(day='20200406');
5. Drop partitions (note: multiple partitions are space-separated when adding, but comma-separated when dropping)
alter table dept_partition drop partition (day='20200406');
alter table dept_partition drop partition (day='20200404'), partition(day='20200405');
6. Check how many partitions there are in the partition table
show partitions dept_partition;
7. View partition table structure
desc formatted dept_partition;
24. Secondary partition
1. Create a secondary partition table
create table dept_partition2( deptno int, dname string, loc string ) partitioned by (day string, hour string) row format delimited fields terminated by '\t';
2. Load and query
load data local inpath '/opt/module/hive/datas/dept_20200401.log' into table dept_partition2 partition(day='20200401', hour='12');
select * from dept_partition2 where day='20200401' and hour='12';
3. Three ways to upload data directly to a partition directory and associate it with the partition table
Method 1: repair after uploading data
dfs -mkdir -p /user/hive/warehouse/mydb.db/dept_partition2/day=20200401/hour=13; dfs -put /opt/module/datas/dept_20200401.log /user/hive/warehouse/mydb.db/dept_partition2/day=20200401/hour=13;
msck repair table dept_partition2;
Method 2: add partition after uploading data
alter table dept_partition2 add partition(day='201709',hour='14');
Method 3: after creating a folder, load data to the partition
load data local inpath '/opt/module/hive/datas/dept_20200401.log' into table dept_partition2 partition(day='20200401',hour='15');
25. Dynamic partitioning
When inserting into a partitioned table, a relational database automatically routes each row to the corresponding partition according to the value of the partition field. Hive provides a similar mechanism, dynamic partitioning, but it requires some configuration.
create table dept_partition_dy(id int, name string) partitioned by (loc int) row format delimited fields terminated by '\t';
set hive.exec.dynamic.partition.mode = nonstrict; insert into table dept_partition_dy partition(loc) select deptno, dname, loc from dept;
Insert the data from the dept table into the corresponding partitions of the target table dept_partition_dy according to region (the loc field).
Note: hive.exec.max.dynamic.partitions.pernode limits how many dynamic partitions each node executing MR may create. Set it according to the actual data: for example, if the source data contains one year of data, i.e. the day field has 365 distinct values, the parameter must be set above 365; with the default of 100, an error is reported.
hive.exec.max.dynamic.partitions.pernode=100
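Other settings commonly tuned alongside it (the values shown are, as far as these notes go, Hive's defaults):
set hive.exec.dynamic.partition=true; -- enable dynamic partitioning
set hive.exec.max.dynamic.partitions=1000; -- max dynamic partitions across all nodes
set hive.exec.max.created.files=100000; -- max HDFS files the whole job may create
set hive.error.on.empty.partition=false; -- whether an empty partition raises an error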
26. Bucketed tables
Partitioning provides a convenient way to isolate data and optimize queries, but not every data set can be partitioned sensibly. For a table or partition, Hive can organize the data further into buckets, a finer-grained division of the data range.
Bucketing is another technique for breaking a data set up into parts that are easier to manage.
A partition corresponds to a storage path (a directory); buckets correspond to data files.
1. Create bucket table
Create the table
create table stu_buck(id int, name string) clustered by(id) into 4 buckets row format delimited fields terminated by '\t';
View structure
desc formatted stu_buck;
Load data into the bucketed table
load data inpath '/student.txt' into table stu_buck;
Hive decides which bucket a record is stored in by hashing the value of the bucketing column and taking the remainder after dividing by the number of buckets (worked example below).
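For example (assuming the usual int hash, which is the value itself): with 4 buckets, a row with id = 1001 gives 1001 % 4 = 1, so it lands in the second bucket file (bucket numbering starts at 0).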
2. Notes on operating bucketed tables
(1) Set the number of reducers to -1 and let the job decide, or set it greater than or equal to the number of buckets in the bucketed table
(2) Load data into the bucketed table from HDFS, to avoid problems with local files not being found
(3) Do not use local mode
3. Import the data into the bucket table in insert mode
insert into table stu_buck select * from student_insert;
27. Sampling query
For very large data sets, sometimes users need to use a representative query result rather than all the results. Hive can meet this requirement by sampling tables.
Syntax: TABLESAMPLE(BUCKET x OUT OF y)
Query data in table stu_buck.
select * from stu_buck tablesample(bucket 1 out of 4 on id);
Note: the value of x must be less than or equal to the value of y, otherwise:
FAILED: SemanticException [Error 10061]: Numerator should not be bigger than denominator in sample clause for table stu_buck
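As for the meaning of x and y: Hive samples bucket x first, then bucket x+y, x+2y, and so on, so y controls the fraction sampled. For the 4-bucket stu_buck table:
select * from stu_buck tablesample(bucket 1 out of 2 on id);
draws buckets 1 and 3, i.e. 4/2 = 2 of the 4 buckets, half the data.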
28. System built-in functions
1. Empty field assignment
NVL: assigns a value to data that is NULL. Format: NVL(value, default_value). It returns default_value if value is NULL, otherwise it returns value; if both parameters are NULL, it returns NULL.
select comm,nvl(comm, -1) from emp;
If the employee's comm is NULL, use -1 instead
select comm, nvl(comm,mgr) from emp;
If the employee's comm is NULL, use the leader's id (mgr) instead
2. CASE WHEN THEN ELSE END
Find out the number of men and women in different departments.
select dept_id, sum(case sex when 'male' then 1 else 0 end) male_count, sum(case sex when 'female' then 1 else 0 end) female_count from emp_sex group by dept_id;
3. Rows to columns
CONCAT(string A/col, string B/col...): returns the concatenation of the input strings; any number of input strings is supported;
CONCAT_WS(separator, str1, str2, ...): a special form of CONCAT(). The first parameter is the separator placed between the remaining parameters; it can be any string. If the separator is NULL, the return value is also NULL. The function skips any NULL or empty strings after the separator parameter;
Note: the arguments of CONCAT_WS must be string or array<string>
COLLECT_SET(col): accepts only primitive data types; its main purpose is to deduplicate and aggregate the values of a field into an Array-type field.
SELECT t1.c_b, CONCAT_WS("|", collect_set(t1.name)) FROM ( SELECT name, CONCAT_WS(',', constellation, blood_type) c_b FROM person_info ) t1 GROUP BY t1.c_b;
Classify people with the same constellation and blood type.
4. Columns to rows
EXPLODE(col): splits a complex Array or Map value in a Hive column into multiple rows.
LATERAL VIEW
Usage: LATERAL VIEW udtf(expression) tableAlias AS columnAlias
Explanation: used together with UDTFs such as split and explode. It expands one column of data into multiple rows, and the expanded data can then be aggregated.
SELECT movie, category_name FROM movie_info lateral VIEW explode(split(category,",")) movie_info_tmp AS category_name;
Expand the array data in the movie category.
5. Window function (windowing function)
OVER(): specifies the window of data over which the analytic function works; the window size may change from row to row.
CURRENT ROW: the current row
n PRECEDING: the n rows before the current row
n FOLLOWING: the n rows after the current row
UNBOUNDED: no bound; UNBOUNDED PRECEDING means from the start of the partition, UNBOUNDED FOLLOWING means to the end of the partition
LAG(col, n, default_val): the value of col from the nth preceding row
LEAD(col, n, default_val): the value of col from the nth following row
NTILE(n): distributes the rows of an ordered window into n numbered groups, starting from 1; for each row, NTILE returns the number of the group the row belongs to. Note: n must be of type int.
Sample data (business.txt), one line per purchase (name, orderdate, cost):
jack,2017-01-01,10
create table business( name string, orderdate string, cost int ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','; load data local inpath "/opt/module/data/business.txt" into table business;
select name,count(*) over () from business where substring(orderdate,1,7) = '2017-04' group by name;
Query the customers and total number of people who purchased in April 2017
select name,orderdate,cost,sum(cost) over(partition by month(orderdate)) from business;
Query the customer's purchase details and monthly total purchase amount
select name, orderdate, cost,
sum(cost) over() as sample1, -- sum over all rows
sum(cost) over(partition by name) as sample2, -- group by name, sum within each group
sum(cost) over(partition by name order by orderdate) as sample3, -- group by name, running total within each group
sum(cost) over(partition by name order by orderdate rows between UNBOUNDED PRECEDING and current row) as sample4, -- same as sample3: aggregates from the start of the partition to the current row
sum(cost) over(partition by name order by orderdate rows between 1 PRECEDING and current row) as sample5, -- current row plus the previous row
sum(cost) over(partition by name order by orderdate rows between 1 PRECEDING and 1 FOLLOWING) as sample6, -- current row plus the rows immediately before and after it
sum(cost) over(partition by name order by orderdate rows between current row and UNBOUNDED FOLLOWING) as sample7 -- current row and all following rows
from business;
The cost of each customer is accumulated according to the date
select name,orderdate,cost, lag(orderdate,1,'1900-01-01') over(partition by name order by orderdate ) as time1, lag(orderdate,2) over (partition by name order by orderdate) as time2 from business;
View the date of each customer's previous purchase
select * from ( select name,orderdate,cost, ntile(5) over(order by orderdate) sorted from business ) t where sorted = 1;
Query the first 20% of order records (ordered by date)
6. Rank
RANK(): ties receive the same rank and the following ranks are skipped, so the total count is unchanged
DENSE_RANK(): ties receive the same rank and the following ranks are not skipped, so the highest rank shrinks
ROW_NUMBER(): numbers the rows consecutively in order
create table score( name string, subject string, score int) row format delimited fields terminated by "\t"; load data local inpath '/opt/module/data/score.txt' into table score;
select name, subject, score, rank() over(partition by subject order by score desc) rp, dense_rank() over(partition by subject order by score desc) drp, row_number() over(partition by subject order by score desc) rmp from score;
name         subject  score  rp  drp  rmp
Song         English  84     1   1    1
Sea          English  84     1   1    2
Tingting     English  78     3   2    3
Monkey King  English  68     4   3    4