数据分析-SQL数据分析实操

描述性分析、分组分析、结构分析、分布分析、交叉分析、矩阵分析、转化分析、RFM分析、留存分析、生命周期分析

葉夏夏

2184人浏览 · 2025-08-31 23:03:19

葉夏夏 · 2025-08-31 23:03:19 发布

描述性分析

知识点

描述性分析，也称描述性统计分析，它是对数据进行分析，得出反映客观现象的各种数量特征的一种分析方法，主要包括数据的集中趋势分析、离散程度分析、频数分布分析等，常用的统计指标有计数、求和、平均值、最小值、最大值、标准差、方差等。

MySQL中对应的函数分别为Count（计数）、Sum（求和）、Avg（平均值）、Min（最小值）、Max（最大值）、StdDev（标准差_英文全程standard deviation）、Variance（方差）。

select
	count(price) as count,
	sum(price) as sum,
	avg(price) as avg,
	min(price) as min,
	max(price) as max,
	stddev(price) as stdev,
	variance(price) as var
from item;

分组分析

简单分组统计

统计不同出版社商品的平均价格

select 
	press,
	avg(price) as price_avg
from item
group by press;

分组排序统计

知识点

MySQL数据框LIMIT用法

SELECT 字段

FROM 表

WHERE 过滤条件

GROUP BY 字段

ORDER BY 字段

LIMIT n

select 
	press,
	count(item_id) as item_cnt
from item
group by press
order by count(item_id) desc;

-- 商品数前5的结果
select 
	press,
	count(item_id) as item_cnt
from item
group by press
order by count(item_id) desc
limit 5;

分组过滤统计

出版社的商品数大于3的数据

知识点

HAVING子句用法

SELECT 字段

FROM 表

WHERE 过滤条件

GROUP BY 字段

HAVING 条件

ORDER BY 字段

select
	press,
	count(item_id) as item_cnt
from item
group by press 
having count(item_id) > 3
order by count(item_id) desc;

分组去重统计

2020年每月登录用户数

关键技巧

1- 去重统计，Count(DISTINCT uid)

2- 日期转文本，Date_format(date,'%Y-%m')

3- GROUP BY子句中不能使用字段的别名，因为SELECT在GROUP BY之后执行，别名还未生效。

SQL执行顺序

FROM 表

JOIN ON

WHERE 过滤行条件

GROUP BY 字段

HAVING 过滤组条件

SELECT 字段

ORDER BY 字段

LIMIT

select
	date_format(date,'%Y-%m') as 'year_month',
	count(distinct uid) as count
from login
where `date` between '2020/1/1' and '2020/12/31'
group by date_format(date,'%Y-%m');

结构分析

简单结构分析

select 
	press,
	count(item_id) as count,
	-- (select count(item_id) from item) as total,
	count(item_id)/(select count(item_id) from item) as p
from item 
group by press
order by count(item_id) desc;

组内占比

知识点

窗口函数定义：

窗口函数(Windows Function)又叫分析函数(Analytic Function)，可以像聚合函数一样对一组数据进行分析并返回结果，二者不同之处在于，窗口函数不是将一组数据汇总成单个结果，而是为每一行数据都返回一个结果。

窗口函数语法：

<窗口函数> OVER (

[PARTITION BY <用于分组的列>]

[ORDER BY <用于排序的列>]

[frame_clause<设定窗口大小>]

)

常用的窗口函数有：

SUM()、AVG()、COUNT()、MAX()、MIN()、

RANK()、DENSE_RANK()、ROW_NUMBER()等。

select
	item.*,
	sum(price) over (partition by press) as 组内求和,
	(price / sum(price) over (partition by press)) as 组内占比,
	rank() over (partition by press order by price desc) as 组内排名
from item;

分布分析

基于表item，新增字段labels，对“price”字段进行以50元、100元为区间临界值的价格段分组操作。

知识点

CASE语句用法

CASE

WHEN 条件表达式1 TEHN 结果1

……

WHEN 条件表达式n THEN 结果n

ELSE 其他结果

END

通用表表达式

Common Table Expression简称CTE，用于简化复杂的子查询和连接查询，提高SQL语句的可读性和性能。

临时表 with…as

with

临时表名称 as 子查询语句,

…

-- case 语句
select
	case
		when price > 100 then '(100,+∞)' 
		when price > 50 then '(50,100]'
		else '(0,50]'
	end as labels,
	count(item_id) as count
from item
group by 
	case
		when price > 100 then '(100,+∞)' 
		when price > 50 then '(50,100]'
		else '(0,50]'
	end;

-- 不是所有数据库都可以直接写成group by labels
select
	case
		when price > 100 then '(100,+∞)' 
		when price > 50 then '(50,100]'
		else '(0,50]'
	end as labels,
	count(item_id) as count
from item
group by labels;

-- 临时表CTE
with a as 
(
select
	item.*,
	case
		when price > 100 then '(100,+∞)' 
		when price > 50 then '(50,100]'
		else '(0,50]'
	end as labels
from item
)

select
	labels,
	count(item_id) as count
from a
group by labels;

交叉分析

交叉分析1

-- 写法一
with a as
(
select 
	item.*,
	(case when price <= 50 then '50元以内' else null end) as '50元以内',
	(case when price > 50 then '50元以上' else null end) as '50元以上'
from item
)

select
	press,
	count(50元以内) as '50元以内',
	count(50元以上) as '50元以上'
from a
group by press;

-- 写法二
select
	press,
	count((case when price <= 50 then '50元以内' else null end)) as '50元以内',
	count((case when price > 50 then '50元以上' else null end)) as '50元以上'
from item
group by press;

交叉分析2

select
	press,
	count((case when price <= 50 then '50元以内' else null end)) as '50元以内',
	count((case when price > 50 then '50元以上' else null end)) as '50元以上',
	count(item_id) as 总计
from item
group by press;

一维表转二维表

select 
	地区,
	sum((case when 年份 = '2006年' then GDP else 0 end)) as '2006年',
	sum((case when 年份 = '2007年' then GDP else 0 end)) as '2007年',
	sum((case when 年份 = '2008年' then GDP else 0 end)) as '2008年'
from 一维表
group by 地区;

矩阵分析

select 
	province,
	avg(cost) as cost_avg,
	avg(flow) as flow_avg
from telecom
group by province
order by convert(province using gbk);

-- 写法一
select 
	b.area,
	avg(cost) as cost_avg,
	avg(flow) as flow_avg
from telecom as a
left join area as b
on a.province = b.province
group by area;

-- 写法二
with c as 
(
select 
	a.*,
	b.area
from telecom as a
left join area as b
on a.province = b.province
)

select
	area,
	round(avg(cost),2) as cost_avg,
	round(avg(flow),2) as flow_avg
from c
group by area;

转化分析

select 
	behavior_type,
	count(uid) as uid_count,
	-- (select count(uid) from behavior where behavior_type = 1) as total_type1,
	count(uid) / (select count(uid) from behavior where behavior_type = 1) as rate
from behavior
group by behavior_type;

RFM分析

计算RFM数据指标

select
	a.uid,
	timestampdiff(day,max(a.date),'2020/12/31') as r,
	count(uid) as f,
	sum(b.price) as m
from behavior as a
left join item as b 
on a.item_id = b.item_id
where a.behavior_type = 3
group by a.uid
order by a.uid

计算R分值、F分值、M分值和RFM分值

select
	rfm.*,
	if(r < (select avg(r) from rfm),2,1) as r_s,
	if(f > (select avg(f) from rfm),2,1) as f_s,
	if(m > (select avg(m) from rfm),2,1) as m_s,
	if(r < (select avg(r) from rfm),2,1)*100+
	if(f > (select avg(f) from rfm),2,1)*10+
	if(m > (select avg(m) from rfm),2,1) as rfm_score
from rfm

根据RFM分值细分客户

select 
	c.rfm_score,
	d.type,
	count(uid) as cnt
from rfm_score as c
left join rfm_type as d
on c.rfm_score = d.rfm_score
group by c.rfm_score
order by c.rfm_score

-- 完整代码

with rfm as 
(
select
	a.uid,
	timestampdiff(day,max(a.date),'2020/12/31') as r,
	count(uid) as f,
	sum(b.price) as m
from behavior as a
left join item as b 
on a.item_id = b.item_id
where a.behavior_type = 3
group by a.uid
order by a.uid
)
,
rfm_score as 
(
select
	rfm.*,
	if(r < (select avg(r) from rfm),2,1) as r_s,
	if(f > (select avg(f) from rfm),2,1) as f_s,
	if(m > (select avg(m) from rfm),2,1) as m_s,
	if(r < (select avg(r) from rfm),2,1)*100+
	if(f > (select avg(f) from rfm),2,1)*10+
	if(m > (select avg(m) from rfm),2,1) as rfm_score
from rfm
)

select 
	c.rfm_score,
	d.type,
	count(uid) as cnt
from rfm_score as c
left join rfm_type as d
on c.rfm_score = d.rfm_score
group by c.rfm_score
order by c.rfm_score;

留存分析

2020/1/1到2020/1/6日新增用户的次日留存率

-- 写法一
with c as 
(
select 
	a.uid,
	a.date as new_date,
	b.date as date2,
	timestampdiff(day,a.date,b.date) as day
from login as a
left join (select * from login where date between '2020/1/1' and '2020/1/7') as b
on a.uid = b.uid
where a.type = 1 and a.date between '2020/1/1' and '2020/1/6'
order by a.uid,a.date,b.date
)

select 
	new_date,
	count(if(day = 0,uid,null)) as day0,
	count(if(day = 1,uid,null)) as day1,
	count(if(day = 1,uid,null))/count(if(day = 0,uid,null)) as 次日留存率
from c
group by new_date; 

-- 写法二
select
	a.date,
	sum(case when b.uid is not null then 1 else 0 end)/count(a.uid) as 次日留存率
from (select uid,date,type from login) as a 
left join (select uid,date,type from login where date between '2020/1/1' and '2020/1/7') as b
on a.uid = b.uid and b.date = date_add(a.date,interval 1 day)
where a.type = 1 and a.date between '2020/1/1' and '2020/1/6'
group by a.date;

生命周期分析

计算每日新增用户明细

select 
	uid,
	date as new_date
from login
where type = 1 and date between '2020/12/1' and '2020/12/6'

计算每日每个客户的消费金额

select 
	b.uid,
	b.date as pay_date,
	sum(c.price) as pay
from behavior as b
left join item as c
on b.item_id = c.item_id
where b.behavior_type = 3 and b.date between '2020/12/1' and '2020/12/12'
group by b.uid,b.date
order by b.uid

将新增用户数与用户消费数据连接，计算用户消费日与新增日的间隔天数

select 
	a.*,
	d.pay_date,
	d.pay,
	timestampdiff(day,a.new_date,d.pay_date) as day
from a
left join d on a.uid = d.uid
order by a.uid

计算新增用户的LTV

select 
	new_date,
	count(distinct uid) as 新增用户数,
	sum(if(day = 0,pay,0)) as payday1,
	sum(if(day between 0 and 6,pay,0)) as payday7,
	sum(if(day = 0,pay,0))/count(distinct uid) as LTV1,
	sum(if(day between 0 and 6,pay,0))/count(distinct uid) as LTV7
from e
group by new_date

-- 完整代码
with a as 
(
select 
	uid,
	date as new_date
from login
where type = 1 and date between '2020/12/1' and '2020/12/6'
)
,
d as 
(
select 
	b.uid,
	b.date as pay_date,
	sum(c.price) as pay
from behavior as b
left join item as c
on b.item_id = c.item_id
where b.behavior_type = 3 and b.date between '2020/12/1' and '2020/12/12'
group by b.uid,b.date
order by b.uid
)
,
e as 
(
select 
	a.*,
	d.pay_date,
	d.pay,
	timestampdiff(day,a.new_date,d.pay_date) as day
from a
left join d on a.uid = d.uid
order by a.uid
)

select 
	new_date,
	count(distinct uid) as 新增用户数,
	sum(if(day = 0,pay,0)) as payday1,
	sum(if(day between 0 and 6,pay,0)) as payday7,
	sum(if(day = 0,pay,0))/count(distinct uid) as LTV1,
	sum(if(day between 0 and 6,pay,0))/count(distinct uid) as LTV7
from e
group by new_date;

永洪数据分析社区

永洪科技，致力于打造全球领先的数据技术厂商，具备从数据应用方案咨询、BI、AIGC智能分析、数字孪生、数据资产、数据治理、数据实施的端到端大数据价值服务能力。

更多推荐

java计算机毕业设计教师工作量统计系统基于SpringBoot的高校教师绩效测算与可视化平台教师教学任务与工作量智能汇总系统

永洪数据分析社区

BI是报表？BI是可视化？BI到底是什么？

永洪数据分析社区

AI智能体+BI可视化：1小时极速入门，市场部独力完成分析报告

商务蓝：适合正式报告活力橙：适合创意活动环保绿：适合可持续发展主题自主分析能力：不再依赖IT部门，活动当天即可产出分析结论智能数据处理：AI智能体自动完成80%的数据清洗和分析工作专业可视化：通过PowerBI制作媲美专业数据分析师的报告持续优化：建立可复用的分析模板，后续活动效率更高决策支持：基于数据快速调整营销策略，提升活动ROI现在就可以尝试部署你的第一个智能分析环境，体验从数据到决策的高速