
pandas数据分析,matplotlib绘图(折线图),图片和数据批量写入excel
pandas数据分析,绘制折线图,批量写入数据和图片到excel,matplotlib折线图绘制封装
·
pandas数据分析,matplotlib绘图并解决中文乱码问题
批量将数据写入多个sheet的excel表,并将图片和数据写入同一个sheet,查了不少api,希望对大家有帮助
代码就不一一解释了,自己看吧
最终生成效果图
# -*- coding: utf-8 -*-
# @File : data_statistics_and_draw_imgs.py
# @Date : 2023-03-27
# @Author : jiang
import sys
sys.path.append("..")
print(sys.path)
import traceback
import pandas as pd
import logging
import math
from datetime import datetime
from datetime import timedelta
import matplotlib.pyplot as plt
import matplotlib as mpl
plt.switch_backend('agg')
# 设置背景为白色
plt.rcParams['axes.facecolor'] = 'w'
# mac电脑字体
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
mpl.rcParams['font.serif'] = ['Arial Unicode MS']
# linux字体
# plt.rcParams['font.sans-serif'] = ['Microsoft YaHei'] # 用来正常显示中文标签
# mpl.rcParams['font.serif'] = ['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号
logging.basicConfig(stream=sys.stdout, level=logging.INFO,
datefmt='%Y-%m-%d %H:%M:%S',
format='%(asctime)s-%(levelname)s: -> %(message)s')
class ProfileUtils(object):
def __init__(self):
self.dir = "/Users/jiang/Downloads/"
self.data_path = "{}/data.txt".format(self.dir)
self.save_path = "{}/用户画像.xls".format(self.dir)
self.columns = ['账号',
'访问日期',
'画像丰满度',
'GPS',
'最近点击过的v1',
'最近点击过的v2',
'最近点击过的guid',
'最近点击过的topic',
'最近点击过的实体',
'最近点击过的泛化实体',
'关注',
'操作系统',
'机型',
'更新时间',
'版本',
'white_group']
def read_data(self, path, columns):
return pd.read_csv(path, sep='\t', header=None, names=columns)
def save_df(self, path, df):
return df.to_csv(path, sep='\t', index=None, header=None)
def save_data_to_excel(self, save_name, all_cols_list, img_path_dic):
writer = pd.ExcelWriter(save_name, engine='xlsxwriter')
for sheet_name, df_item in all_cols_list:
if sheet_name == '画像明细':
df_item.to_excel(writer, sheet_name=sheet_name, index=False)
else:
df_item.to_excel(writer, sheet_name=sheet_name, index=False)
sheet = writer.book.sheetnames[sheet_name]
sheet.insert_image('H2', img_path_dic[sheet_name])
writer.save()
writer.close()
def days(self, day1, day2):
d1 = datetime(int(day1[:4]), int(day1[4:6]), int(day1[-2:]))
d2 = datetime(int(day2[:4]), int(day2[4:6]), int(day2[-2:]))
interval = d2 - d1 # 两日期差距
return interval.days # 具体的天数
def parse_gps(self, row, city_dic):
try:
GPS = row['GPS']
if pd.isna(GPS):
city = "未知"
elif GPS == 'None':
city = "未知"
else:
lis = GPS.split("|")
if len(lis) > 1:
if '中国' in GPS:
city = GPS.split("|")[1]
else:
city = GPS.split("|")[0]
elif len(lis) == 1:
city = GPS.split("|")[0]
else:
city = "未知"
if "�" in city or len(city) == 1:
city = "未知"
if city not in city_dic:
city_dic[city] = 0
city_dic[city] += 1
except Exception as e:
logging.error(f"parse_gps error {traceback.format_exc()}")
def click_tag(self, row, data_dic, col):
try:
tag = row[col]
if not (pd.isna(tag) or tag == 'None'):
lis = tag.split(",")
# 财经|1|4,其他|1|5
for v1_s in lis:
ll = v1_s.split("|")
if len(ll) == 3:
v1_cv = ll[0]
c = int(ll[1])
e = int(ll[2])
if v1_cv not in data_dic:
data_dic[v1_cv] = {
"曝光": 0,
"点击": 0,
"用户数": 0
}
data_dic[v1_cv]['曝光'] += e
data_dic[v1_cv]['点击'] += c
data_dic[v1_cv]['用户数'] += 1
except Exception as e:
logging.error(f"click_tag error {traceback.format_exc()}")
def subscribe(self, row, ts_dic, col):
# 关注
try:
subs = row[col]
if not (pd.isna(subs) or subs == 'None'):
lis = subs.split(",")
# 账号维度:账号关注的人数,关注的日期,关注的小时数
# 时间维度:某一天,某小时,关注了哪些账号
# "4000000089717384490|20230227 15:56:00,80572276|20230101 11:29:01'
for sub_s in lis:
ll = sub_s.split("|")
if len(ll) == 2:
# guid = ll[0]
ts = ll[1]
ts_ll = ts.split(" ")
# day = ts_ll[0]
hour = ts_ll[1][:2]
if hour not in ts_dic:
ts_dic[hour] = 0
ts_dic[hour] += 1
except Exception as e:
logging.error(f"subscribe error {traceback.format_exc()}")
def calc_data(self, df):
v1_dic = {}
df.apply(self.click_tag, axis=1, args=(v1_dic, "最近点击过的v1",))
v2_dic = {}
df.apply(self.click_tag, axis=1, args=(v2_dic, "最近点击过的v2",))
entity_dic = {}
df.apply(self.click_tag, axis=1, args=(entity_dic, "最近点击过的实体",))
generate_en_dic = {}
df.apply(self.click_tag, axis=1, args=(generate_en_dic, "最近点击过的泛化实体",))
topic_dic = {}
df.apply(self.click_tag, axis=1, args=(topic_dic, "最近点击过的topic",))
guid_dic = {}
df.apply(self.click_tag, axis=1, args=(guid_dic, "最近点击过的guid",))
return v1_dic, v2_dic, entity_dic, generate_en_dic, topic_dic, guid_dic
def dic2df(self, data_dic, keys, sort=False, sort_value=False):
try:
if sort and sort_value:
data_dic = dict(sorted(data_dic.items(), key=lambda item: item[1], reverse=True))
elif sort:
data_dic = dict(sorted(data_dic.items(), key=lambda item: item[0]))
sub_list = []
for tag, cnt in data_dic.items():
sub_list.append([tag, cnt])
return pd.DataFrame(sub_list, columns=keys)
except Exception as e:
logging.error(f"dic2df error {traceback.format_exc()}")
return None
def calc_mean_freq(self, freq_list):
try:
if len(freq_list) == 1:
now = (datetime.now() - timedelta(days=0)).strftime('%Y%m%d')
return round(self.days(freq_list[0], now))
elif freq_list and len(freq_list) > 1:
freq_list.sort()
days_list = []
for i in range(len(freq_list)):
if i + 1 < len(freq_list):
dd = self.days(freq_list[i], freq_list[i + 1])
days_list.append(dd)
if len(days_list) > 0:
return round(math.ceil(sum(days_list) / len(days_list), ))
else:
return -1
else:
return -1
except Exception as e:
logging.error(f"calc_mean_freq error {freq_list} {traceback.format_exc()}")
return -1
def visit_day(self, row, visit_dic, col):
try:
visits = row[col]
if not (pd.isna(visits) or visits == 'None'):
# 不同日期访问的数量
# 访问日期频率
# 20230101|20221214
lis = visits.split("|")
mean_freq = self.calc_mean_freq(lis)
if mean_freq not in visit_dic['freq']:
visit_dic['freq'][mean_freq] = 0
visit_dic['freq'][mean_freq] += 1
times = len(lis)
if times not in visit_dic['times']:
visit_dic['times'][times] = 0
visit_dic['times'][times] += 1
except Exception as e:
logging.error(f"visit_day error {traceback.format_exc()}")
def cur_level(self, dic):
new_dic = {}
try:
for k, v in dic.items():
if k <= 1:
key = '1'
elif k <= 2:
key = '2'
elif k <= 5:
key = "5"
elif k <= 8:
key = '8'
elif k <= 10:
key = "10"
elif k <= 14:
key = "14"
elif k <= 18:
key = "18"
elif k <= 20:
key = "20"
elif k <= 30:
key = "30"
else:
key = "50"
if key not in new_dic:
new_dic[key] = 0
new_dic[key] += v
new_dic = dict(sorted(new_dic.items(), key=lambda item: int(item[0])))
except Exception as e:
logging.error(f"cur_level error {traceback.format_exc()}")
return new_dic
def plot_line(self, save_path, x_data, y_data_list, colors, title, rotation=90):
try:
# 图表大小
fig = plt.figure(figsize=(20, 16))
# 设置点大小
for i, y_data in enumerate(y_data_list):
# 画图
plt.plot(x_data, y_data, color=colors[i], marker='o', label='曝光', alpha=0.25)
# # 标记数值
for x, y in zip(x_data, y_data):
plt.text(x, y, y, ha='center', va='bottom', fontsize=14)
# 设置网格线
plt.grid(color='black', axis="y", linestyle='-.', linewidth=1, alpha=0.15)
# x轴斜向展示
plt.xticks(rotation=rotation)
# 坐标轴字体
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# 设置坐标轴颜色
ax = plt.gca()
ax.spines['left'].set_color('black')
ax.spines['bottom'].set_color('black')
plt.title(title, fontsize=20)
# plt.show()
fig.savefig(save_path, dpi=400, facecolor="white")
except Exception as e:
logging.error(f"plot_line error {save_path} {traceback.format_exc()}")
def merge_click_df(self, data_dic, columns):
try:
tag_list = []
for tag, dd in data_dic.items():
sub_ = [tag, dd['曝光'], dd['点击'], dd['用户数']]
tag_list.append(sub_)
return pd.DataFrame(tag_list, columns=columns)
except Exception as e:
logging.error(f"merge_click_df error {data_dic} {traceback.format_exc()}")
return None
def get_path(self, name):
return f"{self.dir}/img/{name}"
def plot_df(self):
df = self.read_data(self.data_path, self.columns)
logging.info(f"df: {df.shape} {df.columns} \n {df.head()}")
city_dic = {}
df.apply(self.parse_gps, axis=1, args=(city_dic,))
logging.info(f"city_dic: {len(city_dic)} {city_dic}")
v1_dic, v2_dic, entity_dic, generate_en_dic, topic_dic, guid_dic = self.calc_data(df)
logging.info(f"v1_dic: {len(v1_dic)} {v1_dic}")
logging.info(f"v2_dic: {len(v2_dic)}")
logging.info(f"entity_dic: {len(entity_dic)}")
logging.info(f"generate_en_dic: {len(generate_en_dic)}")
logging.info(f"topic_dic: {len(topic_dic)}")
logging.info(f"guid_dic: {len(guid_dic)}")
ts_dic = {}
df.apply(self.subscribe, axis=1, args=(ts_dic, "关注",))
logging.info(f"ts_dic: {len(ts_dic)} {ts_dic}")
hour_df = self.dic2df(ts_dic, ['小时(关注)', '人数'], sort=True)
visit_dic = {
"freq": {},
"times": {}
}
df.apply(self.visit_day, axis=1, args=(visit_dic, "访问日期",))
logging.info(f"freq: {len(visit_dic['freq'])} {visit_dic['freq']}")
logging.info(f"times: {len(visit_dic['times'])} {visit_dic['times']}")
freq_dic = self.cur_level(visit_dic['freq'])
freq_df = self.dic2df(freq_dic, ['访问频率', '人数'], sort=False)
times_df = self.dic2df(visit_dic['times'], ['小时(访问)', '人数'], sort=True)
city_df = self.dic2df(city_dic, ['城市', '人数'], sort=True, sort_value=True)
logging.info(f"city_df: {city_df.shape} {city_df.head()}")
v1_df = self.merge_click_df(v1_dic, ["一级分类", "曝光", "点击", "用户数"])
v2_df = self.merge_click_df(v2_dic, ["二级分类", "曝光", "点击", "用户数"])
entity_df = self.merge_click_df(entity_dic, ["实体", "曝光", "点击", "用户数"])
generate_en_df = self.merge_click_df(generate_en_dic, ["泛化实体", "曝光", "点击", "用户数"])
topic_df = self.merge_click_df(topic_dic, ["话题", "曝光", "点击", "用户数"])
guid_df = self.merge_click_df(guid_dic, ["账号", "曝光", "点击", "用户数"])
img_path_dic = {
"一级分类": self.get_path("v1.png"),
"二级分类": self.get_path("v2.png"),
"实体": self.get_path("entity.png"),
"泛化实体": self.get_path("generate.png"),
"话题": self.get_path("topic.png"),
"账号": self.get_path("guid.png"),
"小时关注人数": self.get_path("hour.png"),
"访问频率": self.get_path("freq.png"),
"访问次数": self.get_path("times.png"),
"城市分布": self.get_path("city.png")
}
self.plot_line(img_path_dic['一级分类'], v1_df['一级分类'], [v1_df['曝光'], v1_df['点击'], v1_df['用户数']],
['red', 'blue', 'green'],
"一级分类: 曝光(红),点击(蓝),点击人数(绿)")
self.plot_line(img_path_dic['二级分类'], v2_df['二级分类'].iloc[:50],
[v2_df['曝光'].iloc[:50], v2_df['点击'].iloc[:50], v2_df['用户数'].iloc[:50]],
['red', 'blue', 'green'],
"二级分类: 曝光(红),点击(蓝),点击人数(绿)")
self.plot_line(img_path_dic['实体'], entity_df['实体'].iloc[:50],
[entity_df['曝光'].iloc[:50], entity_df['点击'].iloc[:50], entity_df['用户数'].iloc[:50]],
['red', 'blue', 'green'], "实体: 曝光(红),点击(蓝),点击人数(绿)")
self.plot_line(img_path_dic['泛化实体'], generate_en_df['泛化实体'].iloc[:50],
[generate_en_df['曝光'].iloc[:50], generate_en_df['点击'].iloc[:50], entity_df['用户数'].iloc[:50]],
['red', 'blue', 'green'], "泛化实体: 曝光(红),点击(蓝),点击人数(绿)")
self.plot_line(img_path_dic['话题'], topic_df['话题'].iloc[:50],
[topic_df['曝光'].iloc[:50], topic_df['点击'].iloc[:50], topic_df['用户数'].iloc[:50]],
['red', 'blue', 'green'], "话题: 曝光(红),点击(蓝),点击人数(绿)")
self.plot_line(img_path_dic['账号'], guid_df['账号'].iloc[:50],
[guid_df['曝光'].iloc[:50], guid_df['点击'].iloc[:50], guid_df['用户数'].iloc[:50]],
['red', 'blue', 'green'], "guid: 曝光(红),点击(蓝),点击人数(绿)")
self.plot_line(img_path_dic['小时关注人数'], hour_df['小时(关注)'], [hour_df['人数']], ['red'], "关注人数", rotation=0)
self.plot_line(img_path_dic['访问频率'], freq_df['访问频率'], [freq_df['人数']], ['red'], "访问频率", rotation=0)
self.plot_line(img_path_dic['访问次数'], times_df['小时(访问)'], [times_df['人数']], ['red'], "访问次数", rotation=0)
self.plot_line(img_path_dic['城市分布'], city_df['城市'], [city_df['人数']], ['red'], "城市分布", rotation=90)
all_cols_list = [
["画像明细", df],
['一级分类', v1_df],
['二级分类', v2_df],
['实体', entity_df],
['泛化实体', generate_en_df],
['话题', topic_df],
['账号', guid_df],
['小时关注人数', hour_df],
['访问频率', freq_df],
['访问次数', times_df],
['城市分布', city_df]
]
self.save_data_to_excel(self.save_path, all_cols_list, img_path_dic)
obj = ProfileUtils()
obj.plot_df()
数据格式
81890649 20221112 0.6179316052346939 中国|上海市|上海市|普陀区|大渡河路565号(长风公园地铁站2号口步行200米)|友缘茶人(大渡河路店) 生活|1|1 生活分享|1|1 4000000089719214312|1|1 8KkYtoHh2Jc|1|1 None None None iphone_16.1 iphone13_4,iphone 2022-11-18 23:14:39 7.58.1 None
76919736 20221227 0.6179316052346939 中国|北京市|北京市 文化|1|1 东方智慧|1|1 4000000089718963493|1|1 8M4LIgYEjmy|1|1 None None None android_10 m2007j22c,redmi 2023-01-02 22:43:38 7.60.0 None
80404085 20221125 0.6179316052346939 中国|上海市|上海市|嘉定区|张掖路| 文化|1|2 文化理论|1|2 4000000089719025918|1|2 8LEB2OXOCxs|1|3 None None None android_12 m2102k1c,xiaomi 2022-12-01 22:24:33 7.58.2 None
70407321 None None None None None None None None None 94088003|20230303 10:38:56,50478422|20230303 10:38:55,4000000087352550627|20230303 10:38:54 None None None None None
4000000089719227369 20230101|20221214 0.6262705367812866 中国|广东省|深圳市|龙岗区|兴龙路| 旅游|2|8,体育|1|12 旅游政策性资讯|2|2,体育娱乐八卦|1|3 4000000089719227369|2|4,40275996|1|1 8Ljco89fQmm|2|3,8L1FvIG3rRw|1|11 达州市|2|2,四川省|2|2 冬天应季游|2|2,冰雪游|2|2,温泉度假游|2|2 None android_12 lge-an00,honor 2023-01-03 11:13:39 7.61.0 None
9565171 20221121 0.6179316052346939 中国|广东省|中山市 情感|1|1 心灵鸡汤|1|1 87813829|1|1 8L6Xb96FvEm|1|2 None None None android_10 lio-an00,huawei 2022-11-27 23:24:59 7.58.2 None
4000000089716853269 None None None None None None None None None 4000000089716827825|20221228 09:17:04 None None None None None
4000000015087800646 None None None None None None None None None 4000000089717384490|20230227 15:56:00,80572276|20230101 11:29:01 None None None None None
4000000089718249026 20221227 0.6179316052346939 中国|内蒙古自治区|赤峰市|元宝山区|G16(丹锡高速)| 军事|1|1 军事知识|1|1 4000000089718249026|1|3 8LTtiqUpgGm|1|1 None None 88407282|20230315 21:23:17,4000000089718514440|20230308 04:23:18,4000000089719449903|20230306 09:53:16,73812971|20230224 00:18:12,97499832|20230222 22:11:21,74526440|20230213 15:04:46,94297008|20230203 20:40:05,93545605|20230201 16:40:30,4000000089719077377|20230123 18:06:33,78575652|20230123 18:04:09,4000000089718631016|20230121 18:15:06,4000000089718926745|20230113 11:45:35,4000000089717533288|20230108 19:04:20,4000000089716849322|20230108 18:54:49,4000000009675440627|20230103 01:03:18,4000000087958970623|20230101 16:39:17,4000000089366840623|20221222 21:24:08,67962554|20221218 18:30:04,40275996|20221215 20:53:24,4000000089716876908|20221214 19:44:27,4000000046052300623|20221213 17:21:16,4000000089718868282|20221206 17:55:54,4000000089717609772|20221206 09:40:10,4000000089717505210|20221205 22:44:57,4000000089719271110|20221201 19:02:30,93425221|20221130 13:13:17,4000000089719273042|20221130 02:03:53,75924071|20221129 20:16:40,4000000089717557380|20221124 21:40:26,61980778|20221123 18:17:15,86531178|20221122 09:51:56,4000000075239060628|20221121 19:02:44,4000000089719243384|20221119 23:18:21,94192268|20221119 08:52:43,4000000089717578344|20221118 12:30:55,4000000089718915089|20221118 12:26:42,4000000089718831949|20221115 06:32:14,21534852|20221113 17:54:07,4000000089719215032|20221113 11:46:22,4000000087352550627|20221110 08:52:20,85354052|20221109 20:58:27,4000000047203130623|20221109 19:34:24,4000000089719094567|20221109 19:15:06,4000000089719150656|20221107 00:11:51,4000000089719174178|20221031 09:03:02,4000000089718532186|20221024 16:23:45,4000000089716816483|20221022 10:39:45,4000000089718754511|20221019 15:07:59,4000000006494000623|20221014 19:41:37,92240778|20221011 16:02:16,4000000089719113390|20221009 23:22:40,4000000051390650623|20221005 21:05:24,4000000089717544773|20220930 18:08:46,4000000025878980624|20220930 18:08:42,4000000089717610763|20220930 18:08:01,4000000089717261533|20220929 13:26:11,4000000064983110623|20220929 09:03:13,4000000051766160624|20220928 20:31:00,4000000089718528657|20220928 20:30:11,97518683|20220928 12:38:52,94367337|20220927 20:05:04,4000000049817060623|20220926 17:54:11,4000000086140400625|20220923 17:38:59,4000000089719064228|20220923 12:13:07,97518705|20220923 12:12:03,4000000030670280633|20220922 17:57:49,4000000086773280625|20220921 19:22:34,4000000089718987397|20220919 22:49:12,4000000089717599001|20220915 22:55:41,4000000089719020521|20220910 21:38:08,4000000089718199219|20220909 12:41:23,48954335|20220906 14:14:06,91459300|20220904 12:53:39,4000000024873620623|20220902 18:08:22,4000000089718584953|20220609 11:15:17,4000000089717639139|20220609 11:15:11,4000000089716613916|20220309 18:31:50,4000000089717630531|20220302 11:46:23 android_12 ana-an00,huawei 2023-01-02 22:53:30 7.61.0 None
4000000089719137203 None None None None None None None None None 76279380|20230105 16:58:54 None None None None None
4000000089719281486 20221222 0.6218582440451138 中国|上海市|上海市 时政|1|1,社会|1|3 灾情播报及预报|1|1,东亚和环太平洋时政|1|1 87813829|1|1,4000000089719060681|1|3 8Lx0VBH5wv2|1|1,8Lx7sCa3Wim|1|1 None None None android_10 hma-al00,huawei 2023-01-03 19:03:46 7.60.0 None
4000000029094140626 None None None None None None None None None 4000000030829220624|20221204 05:46:40,4000000056781080624|20200609 05:26:38,95409948|20190718 11:56:37 None None None None None
4000000061880780623 20221127 0.6179316052346939 中国|北京市|北京市|东城区|东长安街1号|东方新天地 文化|1|1 文化理论|1|1 4000000089719025918|1|1 8LEB2OXOCxs|1|2 None None None iphone_15.5 iphone14_2,iphone 2022-12-03 22:14:30 7.58.2 None
4000000089719408157 20230101 0.6179316052346939 文化|1|1 解读小说|1|1 4000000089719237515|1|1 8Kr9qtFToVE|1|1,8KoB3WyCx60|1|1 None None None iphone_13.6 iphone9_2,iphone 2023-01-04 22:43:29 7.61.0 None
4000000089719072754 20230103|20230101|20221230|20221228|20221227|20221224 0.6448956506309339 中国|河南省|洛阳市 社会|5|22,三农|1|1,其他|1|5 城市与乡村建设|3|6,三农政策|1|1,政府在行动|1|2,正能量|1|2 4000000089719072754|7|24 8LsIB6nCVBg|1|3,8Kpo25R4FDk|1|2 肖亚庆|1|3 影视剧点评|1|6 None android_13 sm-s9080,samsung 2023-01-04 08:53:45 7.61.0 None
4000000006327020631 20221225 0.6179316052346939 中国|四川省|成都市 生活|1|1 节日祝福(祝福)|1|1 4000000089718963493|1|1 8LyVfEkHMsy|1|1 None None None iphone_14.7.1 iphone13_1,iphone 2022-12-31 22:43:46 7.60.0 None
更多推荐
所有评论(0)