2024-08-18

使用Pandas做数据分析

读取csv文件

import os
import pandas as pd
from sklearn import tree

def read_data(data_dir):
    """Build a table of mean cpu/memory/disk values, one row per CSV file.

    Every entry returned by ``os.listdir(data_dir)`` is read with
    ``pandas.read_csv``. The text before the first ``.`` of the file name is
    parsed as an integer and stored in the ``ids`` column; the means of the
    file's ``cpu``, ``memory`` and ``disk`` columns are stored in the
    ``cpu``, ``mem`` and ``disk`` columns respectively.

    Args:
        data_dir: Path of the directory containing the CSV files.

    Returns:
        A ``pandas.DataFrame`` with columns ``ids``, ``cpu``, ``mem`` and
        ``disk``, sorted by ``ids``. Index labels reflect the order in which
        the files were listed. An empty directory yields an empty frame with
        the same columns.

    Raises:
        ValueError: If a file name does not start with an integer.
        KeyError: If a file lacks a ``cpu``, ``memory`` or ``disk`` column.
    """
    columns = ['ids', 'cpu', 'mem', 'disk']

    # Collect plain tuples and build the frame once. Growing a DataFrame one
    # row at a time through ``.loc`` copies the data on every insertion and
    # funnels each mixed int/float row through a Series, which can turn the
    # integer ids into floats.
    rows = []
    for file_name in os.listdir(data_dir):
        stats = pd.read_csv(os.path.join(data_dir, file_name))
        rows.append((
            int(file_name.split(".", 1)[0]),
            stats["cpu"].mean(),
            stats["memory"].mean(),
            stats["disk"].mean(),
        ))

    data = pd.DataFrame(rows, columns=columns)
    return data.sort_values(by="ids")

设置显示参数

# Configure pandas display options: show up to 200 rows and 20 columns,
# and render floating-point values with 0 decimal places.
pd.options.display.max_rows = 200
pd.options.display.max_columns = 20
pd.options.display.precision = 0

数据清洗

# NOTE: `dataFrame`, the column name 'xxxx', and the pattern/replacement pair
# `a`/`b` are placeholders to be substituted with real values.
import re  # used by the normalisation step below

# Normalisation: replace every match of pattern `a` with `b` in column 'xxxx'.
dataFrame['xxxx'] = dataFrame['xxxx'].apply(lambda x: re.sub(a, b, x))

# Pre-processing: rewrite the literal string 'null' as 'NULL' in column 'xxxx'.
# The result is assigned back to the column; calling mask(..., inplace=True)
# on a column selection is a chained in-place edit that does not reach the
# parent frame when Copy-on-Write is active.
dataFrame['xxxx'] = dataFrame['xxxx'].mask(dataFrame['xxxx'] == 'null', 'NULL')

# Remove abnormal data: blank out every cell equal to "null", then drop each
# row that contains a missing value. dropna returns a new frame unless
# inplace=True is given, so without it the rows would never be removed.
dataFrame.mask(dataFrame.isin(["null"]), inplace=True)
dataFrame.dropna(axis=0, how='any', inplace=True)

数据分析接口

数据透视表

# 数据透视表
pd.pivot_table(dataFrame, index=["Index"], values=["Values"], columns="column", aggfunc=[len], margins=True, fill_value=0)

画图

# 版本分布饼图
dataFrame['version'].value_counts().plot(kind='pie', autopct='%.2f%%')

Skippor's Blog

使用Pandas做数据分析

读取csv文件

设置显示参数

数据清洗

数据分析接口

参考