Window opening function of hive

Window function structure:
Analytical function + window clause

Window clause
over(partition by ** order by ** rows between ** and **)
Among them, between ** and ** can be filled in

Start	End	explain
unbounded preceding	current row	From the first line to the current line, this is the default
3 preceding	current row	From the front three lines to the current line
3 preceding	3 following	Current row, three lines forward and three lines backward
current row	unbounded following	From the current line to the back

Analysis functions include

Analytical function	usage
Sum (column name)	When there is order by, it calculates the superimposed values after the partition sorting, and the synthesis within the partition without order by
Min (column name), Max (column name), avg (column name)	Related to order by
count (column name)	Related to order by
row_number()	Starting from 1, the sequence 1, 2, 3, 4, 5 of the records in the grouping is generated in sequence.
rank()	Generate the ranking of data items in the group. Equal ranking will leave 1, 2, 3, 3, 5 vacancies in the ranking.
dense_rank()	Generate the ranking of data items in a group. Equal ranking will not leave a vacancy of 1, 2, 3, 3, 4 in the ranking. First order by
ntile(n)	Divide the grouped data into n slices in order and return the current slice values 1, 2,... If 7 rows are divided into 3 parts, the returned slices are 1, 1, 1, 2, 2, 3, 3; rows between is not supported, order by
cum_dist()	Returns the number of rows less than or equal to the current value/the total number of rows in the group, first order by
percent_rank()	(rank value of the current row in the group - 1)/(total number of rows in the group - 1), order by
Lag (column name, up n rows, default value)	Returns the value of line n upward, or the default value or null if not
Lead (column name, down n rows, default value)	Return to the value in line n below, or to the default value or null if not
First_value (column name)	After sorting within a group, the first value up to the current row is related to order by and rows between.
Last_value (column name)	After sorting the groupings, the last value up to the current row is related to order by and rows between.

Exercise 1

Exercises

Create tables

-- CREATE TABLE IF NOT EXISTS student_scores(
-- id INT,
-- studentId INT,
-- language INT,
-- math INT,
-- english INT,
-- classId STRING,
-- departmentId STRING
-- );

insert data

-- insert into table student_scores values 
--   (1,111,68,69,90,'class1','department1'),
--   (2,112,73,80,96,'class1','department1'),
--   (3,113,90,74,75,'class1','department1'),
--   (4,114,89,94,93,'class1','department1'),
--   (5,115,99,93,89,'class1','department1'),
--   (6,121,96,74,79,'class2','department1'),
--   (7,122,89,86,85,'class2','department1'),
--   (8,123,70,78,61,'class2','department1'),
--   (9,124,76,70,76,'class2','department1'),
--   (10,211,89,93,60,'class1','department2'),
--   (11,212,76,83,75,'class1','department2'),
--   (12,213,71,94,90,'class1','department2'),
--   (13,214,94,94,66,'class1','department2'),
--   (14,215,84,82,73,'class1','department2'),
--   (15,216,85,74,93,'class1','department2'),
--   (16,221,77,99,61,'class2','department2'),
--   (17,222,80,78,96,'class2','department2'),
--   (18,223,79,74,96,'class2','department2'),
--   (19,224,75,80,78,'class2','department2'),
--   (20,225,82,85,63,'class2','department2');

count()

SELECT studentid,math,departmentid,classid,
count(math) over() as count1,  
count(math) over(PARTITION BY classid) as count2,
count(math) OVER(PARTITION BY classid ORDER BY math ) as count3,
count(math) over(PARTITION BY classid ORDER BY math ROWS BETWEEN 1 PRECEDING and 2 FOLLOWING) as count4
FROM student_scores WHERE departmentid='department1'

Interpretation: For a row, count1 counts all rows, count2 counts rows with different classids, count3 counts rows with current classid ranking less than or equal to the current value, count4 counts the total rows of the previous row, the last two rows and the current row (based on the actual number of rows).

sum()

SELECT studentid,math,departmentid,classid,
sum(math) OVER() as sum1,
sum(math) over(PARTITION BY classid) as sum2,
sum(math) over(PARTITION BY classid order by math) as sum3,
sum(math) over(PARTITION BY classid ORDER BY math ROWS BETWEEN 1 PRECEDING AND 2 FOLLOWING) as sum4
FROM student_scores
WHERE departmentid='department1'

min()

SELECT studentid,math,departmentid,classid,
min(math) over() as min1,
min(math) OVER(PARTITION BY classid) as min2,
min(math) OVER(PARTITION BY classid order by classid) as min3,
min(math) OVER(PARTITION BY classid order by classid ROWS BETWEEN 1 PRECEDING AND 2 FOLLOWING) as min4
FROM student_scores
where departmentid='department1'

max()

SELECT studentid,math,departmentid,classid,
max(math) over() as max1,
max(math) over(PARTITION BY classid) as max2,
max(math) over(PARTITION BY classid order by math) as max3,
max(math) over(PARTITION BY classid order by math ROWS BETWEEN 1 PRECEDING and 2 FOLLOWING) as max4
FROM student_scores
where departmentid='department1'

avg()

SELECT studentid,math,departmentid,classid,
round(avg(math) over(),2) as max1,
round(avg(math) over(PARTITION BY classid) ,2) as max2,
round(avg(math) over(PARTITION BY classid order by math) ,2) as max3,
round(avg(math) over(PARTITION BY classid order by math ROWS BETWEEN 1 PRECEDING and 2 FOLLOWING) ,2) as max4
FROM student_scores
where departmentid='department1'

first_value()

SELECT studentid,math,departmentid,classid,
first_value(math) over() as first_value1,
first_value(math) OVER(PARTITION BY classid) as first_value2,
first_value(math) OVER(PARTITION BY classid ORDER BY math) as first_value3,
first_value(math) OVER(PARTITION BY classid ORDER BY math ROWS BETWEEN 1 PRECEDING and 2 FOLLOWING) as first_value4
FROM student_scores
where departmentid='department1'

last_value()

SELECT studentid,math,departmentid,classid,
last_value(math) over() as last_value1,
last_value(math) OVER(PARTITION BY classid) as last_value2,
last_value(math) OVER(PARTITION BY classid ORDER BY math) as last_value3,
last_value(math) OVER(PARTITION BY classid ORDER BY math ROWS BETWEEN 1 PRECEDING and 2 FOLLOWING) as last_value4
FROM student_scores
where departmentid='department1'

lag()

SELECT studentid,math,departmentid,classid,
lag(math,2,60) over(PARTITION BY classid order by math) as lag1,
lag(math,2) over(PARTITION BY classid order by math) as lag2
FROM student_scores
where departmentid='department1'

lead()

SELECT studentid,math,departmentid,classid,
lead(math,2,60) over(PARTITION BY classid order by math) as lead1,
lead(math,2) over(PARTITION BY classid order by math) as lead2
FROM student_scores
where departmentid='department1'

cume_dist()

SELECT studentid,math,departmentid,classid,
round(cume_dist() over(order by math),2) as cum_dist1,
round(cume_dist() OVER(order by math desc) ,2)as cum_dist2,
round(cume_dist() OVER(PARTITION BY classid order by math desc),2) as cum_dist3
FROM student_scores
where departmentid='department1'

rank()

SELECT studentid,math ,
rank() OVER(ORDER BY math) as rank1,
rank() OVER(PARTITION BY departmentid ORDER BY math) as rank2,
rank() over(PARTITION BY departmentid,classid ORDER BY math) as rank3
FROM student_scores

dense_rank()

SELECT studentid,math ,
dense_rank() OVER(ORDER BY math) as dense_rank1,
dense_rank() OVER(PARTITION BY departmentid ORDER BY math) as dense_rank2,
dense_rank() over(PARTITION BY departmentid,classid ORDER BY math) as dense_rank3
FROM student_scores

row_number()

SELECT studentid,departmentid,classid,math,
row_number() over(PARTITION BY departmentid,classid ORDER BY math) as row_number
FROM student_scores

percent_rank()

SELECT studentid,departmentid,classid,math,
row_number() over(PARTITION BY departmentid,classid ORDER BY math) as row_number1,
percent_rank() OVER(PARTITION BY departmentid,classid ORDER BY math) as percent_rank1
FROM student_scores

ntile(n)

SELECT studentid,math,
ntile(2) over(PARTITION BY departmentid order by math) as ntile1,
ntile(3) over(PARTITION BY departmentid order by math) as ntile2
FROM student_scores

Exercise 2

Exercise 2
Create and insert data

-- INSERT INTO hivewindowtest1 VALUES

-- ('tony','2017-01-02',15),

-- ('jack','2017-02-03',23),

-- ('tony','2017-01-04',29),

-- ('jack','2017-01-05',46),

-- ('jack','2017-04-06',42),

-- ('tony','2017-01-07',50),

-- ('jack','2017-01-08',55),

-- ('mart','2017-04-08',62),

-- ('mart','2017-04-09',68),

-- ('neil','2017-05-10',12),

-- ('mart','2017-04-11',75),

-- ('neil','2017-06-12',80),

-- ('mart','2017-04-13',94);

(1) Query the total number of customers and customers purchased in April 2017

SELECT name,count(*) over()
FROM hivewindowtest1
where substring(orderdate,1,7)='2017-04'
group by name

(2) Query the customer's purchase details and total monthly purchases

SELECT *,
sum(cost) OVER(PARTITION BY name,month(orderdate))
FROM hivewindowtest1

(3) In the above scenario, the cost is accumulated by date.

SELECT *,
sum(cost) OVER(PARTITION BY name ORDER BY orderdate)
FROM hivewindowtest1

(4) Query the customer's last purchase time

SELECT *,
lag(orderdate,1) OVER(PARTITION BY name ORDER BY orderdate) as lastbuytime
FROM hivewindowtest1

(5) Query the order information for the first 20% of the time

SELECT name,orderdate,cost from 
(SELECT name,orderdate,cost,
ntile(5) over(ORDER BY orderdate) as hh
FROM hivewindowtest1) TT
where TT.hh=1

Keywords: less

Added by SunsetKnight on Fri, 30 Aug 2019 14:10:36 +0300

Programming VIP