业务上常见的SQL写法

记录一些常用的SQL业务

拉链表

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# test_hits拉链表
"10001"|"测试"|"54"|"2020-01-01"|"9999-12-31"
"10002"|"运行"|"90"|"2020-01-01"|"9999-12-31"
"10003"|"测试"|"10"|"2020-01-01"|"9999-12-31"

# test_update活跃表
"10001"|"运行"|"90"|"2020-01-02"
"10008"|"测试"|"70"|"2020-01-02"

# 方式一,重跑需要回滚操作
SELECT uid,state,VALUE,start_time,end_time
FROM (
SELECT a.uid,a.state,a.value,
a.start_time,
CASE
WHEN a.end_time = '9999-12-31' AND today.uid IS NOT NULL THEN '2020-01-01'
ELSE a.end_time
END AS end_time
FROM (
SELECT *
FROM test_his
WHERE start_time <= '2020-01-01'
AND end_time >= '2020-01-01'
) a
LEFT JOIN (
SELECT *
FROM test_update
) today
ON a.uid=today.uid
UNION
SELECT uid,state,VALUE,
'2020-01-02' AS start_time,
'9999-12-31' AS end_time
FROM test_update
) his;

# 查询快照数据
SELECT *
FROM test_his
WHERE start_time <= '2020-01-01'
AND end_time >= '2020-01-01';

# 回滚操作
SELECT *
FROM test_his
WHERE end_time < '2020-01-02'
UNION ALL
SELECT uid,state,VALUE,start_time,'9999-12-31' AS end_time
FROM test_his
WHERE end_time = '2020-01-02'
UNION ALL
SELECT uid,state,VALUE,start_time,'9999-12-31' AS end_time
FROM test_his
WHERE start_time < '2020-01-03' AND end_time >= '2020-01-03';

# 方式二,重跑不需要回滚操作
SELECT uid,state,VALUE,start_time,end_time
FROM (
SELECT a.uid,a.state,a.value,
a.start_time,
IF(today.uid IS NULL,'9999-12-31','2020-01-01') AS end_time
FROM (
SELECT *
FROM test_his
WHERE start_time <= '2020-01-01'
AND end_time >= '2020-01-01'
) a
LEFT JOIN (
SELECT *
FROM test_update
WHERE dt='2020-01-02'
) today
ON a.uid=today.uid
UNION
SELECT uid,state,VALUE,
'2020-01-02' AS start_time,
'9999-12-31' AS end_time
FROM test_update
WHERE dt='2020-01-02'
) his;

连续登陆

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
查询连续登陆超过N天的用户,最长登陆天数
# 测试数据 test_login
"10001"|"测试"|"70"|"2020-01-02"
"10001"|"测试"|"70"|"2020-01-03"
"10001"|"测试"|"70"|"2020-01-04"
"10001"|"测试"|"70"|"2020-01-05"
"10001"|"测试"|"70"|"2020-01-09"
"10001"|"测试"|"70"|"2020-01-10"
"10001"|"测试"|"70"|"2020-01-11"
"10001"|"测试"|"70"|"2020-01-13"

# 肉眼看,10001连续登陆02-05[4天],09-11[3天]
SELECT uid,max(continuity_days) continuity_days
FROM (
SELECT uid,flag_date,count(1) continuity_days
FROM (
SELECT uid,login_date,date_sub(login_date,rk) flag_date
FROM (
SELECT uid,login_date,row_number() over(partition by uid order by login_date) rk
FROM test_login
) t1
) t1
GROUP BY uid,flag_date
HAVING count(1) >= 3
) t1
GROUP BY uid

# 换个思路,我们可以求解出10月连续充值大于等于N天,10月充值金额总额,连续充值最长天数,连续充值最大金额
# 注意,连续充值最大金额并不一定在连续充值最长天数内

留存与流失计算

1
2
3
4
5
6
7
8
9
10
11
12
13
流失使用的是创建一个当前日期的分区,里面子分区为各流失日期与当前日期的差值(便于计算回归)
每次都是新创建分区
现日期2020-05-05,那么2020-05-04分区将被创建
2020-05-04
1,3,7,14,30
代表当前日期-1,-3,-7,-14,-30到现在的流失数

留存使用的是历史留存日期分区,在里面创建当前日期的新子分区
使用的都是历史分区(当前日期的主分区需要+1天才进行创建)
现日期2020-05-05,那么2020-05-04分区将在2020-05-06时创建
2020-05-03
2020-05-04
代表2020-05-03到现在的留存