基于Python的大数据分析基础（二）--- Pandas及其主要数据结构

mac2025-06-19 4

1.Pandas简介

Pandas是一个Python包，提供快速、灵活和富有表现力的数据结构，使关联或标记数据的使用既简单又直观。

它旨在成为在Python中进行实际的、真实世界数据分析的基础高级构建块。此外，它还有一个更广泛的目标，即成为任何语言中最强大、最灵活的开源数据分析/操作工具。

适合许多不同类型的数据

具有异构类型列的表格数据，如SQL表或Excel表；有序和无序的时间序列数据；具有行和列标签的任意矩阵数据；任何其他形式的观察/统计数据集。实际上，数据完全不需要带有标签就可以放入pandas数据结构中。

2.Pandas中的数据结构

Pandas 中除了 Panel 数据结构，还引入了两种新的数据结构——Series 和 DataFrame，这两种数据结构都建立在 NumPy 的基础之上。

( 1 ) Series ：一维数组系列，也称序列，与 Numpy中的一维 array 类似。二者与Python 基本的数据结构 list 也很相近。

( 2 ) DataFrame ：二维的表格型数据结构。可以将DataFrame 理解为 Series 的容器。以下的内容主要以DataFrame为主。

( 3) Panel：三维数组，可以理解为 DataFrame 的容器。

3.数据结构

3.1 Series 一维数组 <==> array，list

# -*- coding: UTF-8 -*-
"""Series basics: creation, editing and index-aligned arithmetic.

@Author: Jason

pandas data structures: Series (1-D) and DataFrame (2-D) are the most
widely used.  The original notes also list Panel (3-D), Panel4D and
PanelND as higher-dimensional structures.
"""
import pandas as pd
import numpy as np


def createSeries():
    """Print a Series built three ways: from a list, with labels, from a dict."""
    # 1. From a list. With no index given, labels default to 0..n-1.
    list1 = [1, 2, 3, 4, 5]
    s1 = pd.Series(list1)
    print(s1)
    # 0    1
    # 1    2
    # 2    3
    # 3    4
    # 4    5
    # dtype: int64

    # 2. From an array plus an explicit index.
    list2 = np.random.randn(5)  # five random floats, different on every run
    indexList = ["a", "b", "c", "d", "e"]
    s2 = pd.Series(list2, index=indexList)
    print(s2)  # float64 values labelled a..e

    # 3. From a dict: keys become the index, values become the data.
    dict1 = {"a": 1, "b": 2, "c": 3, "d": 4, "e": 5}
    s3 = pd.Series(dict1)
    print(s3)
    # a    1
    # b    2
    # c    3
    # d    4
    # e    5
    # dtype: int64


def seriesBaseOperate():
    """Print the results of relabelling, concatenating, dropping, editing
    and slicing a Series."""
    list1 = [1, 2, 3, 4, 5]
    s1 = pd.Series(list1)
    s1.index = ["A", "B", "C", "D", "E"]  # relabel 0..4 as A..E
    print(s1)
    # A    1
    # B    2
    # C    3
    # D    4
    # E    5
    # dtype: int64

    list2 = np.random.randn(5)
    indexList = ["a", "b", "c", "d", "e"]
    s2 = pd.Series(list2, index=indexList)
    print(s2)  # random float64 values labelled a..e

    d = {"a": 1, "b": 2, "c": 3, "d": 4, "e": 5}
    s3 = pd.Series(d)
    # Join s1 onto the end of s3. pd.concat takes Series objects, not bare
    # scalars. (Series.append, used by the original, is gone in pandas 2.x.)
    s4 = pd.concat([s3, s1])
    print(s4)  # labels a..e followed by A..E, dtype int64

    s4 = s4.drop("e")  # remove the entry labelled "e"
    # Cast to object first: the next line stores a str, and recent pandas
    # versions warn about (or refuse) writing a value that does not fit an
    # int64 Series.
    s4 = s4.astype(object)
    s4["A"] = "JASON"  # overwrite the value labelled "A"
    print(s4["A"])  # lookup by label -> JASON
    print(s4.iloc[0:3])  # positional slice: first three entries
    # a    1
    # b    2
    # c    3
    # dtype: object


def operatingSeries():
    """Print index-aligned arithmetic between two Series, then a few
    aggregates of the first one."""
    list1 = [1, 2, 3, 4, 5]
    s1 = pd.Series(list1)  # labels 0..4
    n = np.random.randn(5)
    indexList1 = [1, 2, 3, 4, 5]
    s2 = pd.Series(n, index=indexList1)  # labels 1..5

    # Arithmetic aligns on index labels. A label present on only one side
    # (here 0 and 5) yields NaN, and the result dtype is float64.
    print(s1.add(s2))
    print(s1.sub(s2))
    print(s1.mul(s2))
    print(s1.div(s2))

    print(s1.median())  # 3.0
    print(s1.max())  # 5
    print(s1.sum())  # 15


if __name__ == "__main__":
    # createSeries()
    # seriesBaseOperate()
    operatingSeries()

3.2 DataFrame 二维数组 <===> ndarray

# -*- coding: UTF-8 -*-
"""DataFrame basics: a 2-D labelled table (compare with a NumPy ndarray)."""
import pandas as pd
import numpy as np
from pandas import Series


class DF(object):
    """Builds two demo frames and walks through common DataFrame operations."""

    def __init__(self):
        """Create df1 (random numbers on a date index) and df2 (animal records)."""
        dates = pd.date_range("today", periods=6)  # date index for df1
        num_arr = np.random.randn(6, 4)  # 6x4 array of random floats
        colnum = ["A", "B", "C", "D"]
        self.df1 = pd.DataFrame(data=num_arr, index=dates, columns=colnum)

        self.dataInfo = {
            'animal': ['cat', 'cat', 'snake', 'dog', 'dog',
                       'cat', 'snake', 'cat', 'dog', 'dog'],
            'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
            'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
            'priority': ['yes', 'yes', 'no', 'yes', 'no',
                         'no', 'no', 'yes', 'no', 'no'],
        }
        labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
        self.df2 = pd.DataFrame(data=self.dataInfo, index=labels)

    def createDataFrame(self):
        """Print both demo frames and the shape of df2."""
        print(self.df1)  # six date-indexed rows, random columns A..D
        print(self.df2)  # ten rows a..j: animal, age, visits, priority
        print(self.df2.shape)  # (10, 4): 10 rows, 4 columns on a fresh instance

    def dataFrame(self):
        """Tour of DataFrame methods.

        Most steps are left as commented-out ``print`` calls to be enabled
        one at a time. As written, the method adds a ``No.`` column to
        ``self.df2`` and prints the per-animal sums of the numeric columns.

        :return: None
        """
        # 1. Peek at the data.
        # print(self.df2.head())   # first rows (5 by default): a..e
        # print(self.df2.tail(3))  # last 3 rows: h, i, j

        # 2. Column names, raw values and index.
        # print(self.df2.columns)
        # -> Index(['animal', 'age', 'visits', 'priority'], dtype='object')
        # print(self.df2.values)   # 2-D array of the cell values
        # print(self.df2.index)
        # -> Index(['a', 'b', ..., 'j'], dtype='object')

        # 3. Summary statistics of the numeric columns (age, visits).
        # print(self.df2.describe())  # age count is 8: the two NaNs are skipped

        # 4. Transpose: rows become columns.
        # print(self.df2.T)

        # 5. Select one column by label.
        # print(self.df2['age'])

        # 6. Select rows by position: the 2nd and 3rd rows (b, c).
        # print(self.df2.iloc[1:3])

        # 7. Single cell by position: 2nd row, 1st column.
        # print(self.df2.iat[1, 0])      # cat

        # 8. Single cell by row label and column label.
        # print(self.df2.loc["f", "age"])  # 2.0

        # 9. Add a column: build a Series on the same index, then assign it.
        # df3 is the same object as self.df2 (no copy), so self.df2 gains
        # the "No." column too. Steps 10-16 rely on that.
        df3 = self.df2
        num = Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], index=df3.index)
        df3['No.'] = num
        # print(df3)

        # 10. Remove data. drop/dropna return new frames; df22 is unchanged.
        df22 = self.df2
        df23 = df22.drop("age", axis=1)  # without the age column
        # print(df23)
        df24 = df22.drop(["age", "No."], axis=1)  # without age and No.
        # print(df24)
        df25 = df22.dropna(how="any")  # drop every row holding any NaN (d, h)
        # print(df25)

        # 11. Fill missing values: NaN becomes 3. Returns a new frame, so
        # df3 itself is not modified; compare the two to see the effect.
        df4 = df3.fillna(value=3)
        # print(df4)

        # 12. Boolean filtering. NaN never satisfies a comparison.
        # print(self.df2[self.df2["age"] < 3])  # rows a, c, f
        # print(self.df2[(self.df2["animal"] == "cat") & (self.df2["age"] > 2)])
        # -> rows a, b
        # print(self.df2[self.df2["animal"].isin(["cat", "dog"])])
        # -> every row whose animal is cat or dog

        # 13. Positional row/column slice: rows c, d; columns age, visits.
        # print(self.df2.iloc[2:4, 1:3])

        # 14. Sort by age descending, then visits ascending. NaN ages go last.
        # print(self.df2.sort_values(by=["age", "visits"],
        #                            ascending=[False, True]))

        # 15. Replace several values at once: yes -> 1, no -> 0.
        # print(self.df2["priority"].map({"yes": 1, "no": 0}))
        # -> a 1, b 1, c 0, d 1, e 0, f 0, g 0, h 1, i 0, j 0

        # 16. Group by animal and sum each group.
        # numeric_only=True keeps only age, visits and No. Without it,
        # pandas >= 2.0 also "sums" the priority strings by concatenation.
        print(self.df2.groupby("animal").sum(numeric_only=True))
        #          age  visits  No.
        # animal
        # cat      7.5       8   13
        # dog     15.0       8   24
        # snake    5.0       3    8


if __name__ == "__main__":
    df = DF()
    # df.createDataFrame()
    df.dataFrame()

参照:《基于Python的大数据分析基础及实战》

最新回复(0)