Pandas 是 Python 中用于数据分析的核心库之一,其中 DataFrame 是最常用的数据结构,类似于 Excel 表格或 SQL 表。下面我们将从创建、属性、索引、筛选、统计、排序等多个维度全面介绍 DataFrame 的基本用法。
```python
# dataframe的创建方式
import pandas as pd
import numpy as np

# 通过series来创建
s1 = pd.Series([1, 2, 3, 4, 5])
s2 = pd.Series([6, 7, 8, 9, 10])
df = pd.DataFrame({"第一列": s1, "第二列": s2})
type(df["第一列"])

# 通过字典来创建
df = pd.DataFrame(
    {
        "id": [1, 2, 3, 4, 5],
        "name": ["Tom", "jack", "alice", "bob", "allen"],
        "age": [15, 17, 20, 26, 30],
        "score": [60.5, 80, 30.6, 70, 83.5],
    },
    index=[1, 2, 3, 4, 5],
    columns=["name", "id", "age", "score"],
)
df
```

```
    name  id  age  score
1    Tom   1   15   60.5
2   jack   2   17   80.0
3  alice   3   20   30.6
4    bob   4   26   70.0
5  allen   5   30   83.5
```
```python
# dataframe的属性
print('行索引:')
print(df.index)
print('列标签:')
print(df.columns)
print('值:')
print(df.values)
```

```
行索引:
Index([1, 2, 3, 4, 5], dtype='int64')
列标签:
Index(['name', 'id', 'age', 'score'], dtype='object')
值:
[['Tom' 1 15 60.5]
 ['jack' 2 17 80.0]
 ['alice' 3 20 30.6]
 ['bob' 4 26 70.0]
 ['allen' 5 30 83.5]]
```

```python
print('维度:', df.ndim)
print('数据类型:')
print(df.dtypes)
print('形状:', df.shape)
print('元素个数:', df.size)
```

```
维度: 2
数据类型:
name      object
id         int64
age        int64
score    float64
dtype: object
形状: (5, 4)
元素个数: 20
```

```python
# 行列转置
print(df.T)
```

```
          1     2      3     4      5
name    Tom  jack  alice   bob  allen
id        1     2      3     4      5
age      15    17     20    26     30
score  60.5  80.0   30.6  70.0   83.5
```

```python
# 获取元素 loc iloc at iat
# 某行
print(df.loc[4])
print(df.iloc[3])
```

```
name      bob
id          4
age        26
score    70.0
Name: 4, dtype: object
name      bob
id          4
age        26
score    70.0
Name: 4, dtype: object
```

```python
# 某列
print(df.loc[:, 'name'])
print(df.iloc[:, 0])
```

```
1      Tom
2     jack
3    alice
4      bob
5    allen
Name: name, dtype: object
1      Tom
2     jack
3    alice
4      bob
5    allen
Name: name, dtype: object
```

```python
# 单个元素
print(df.at[3, 'score'])
print(df.iat[2, 1])
print(df.loc[3, 'score'])
print(df.iloc[2, 1])
```

```
30.6
3
30.6
3
```

```python
# 获取单列数据
print(df['name'])
print(type(df['name']))
print(df.name)
```

```
1      Tom
2     jack
3    alice
4      bob
5    allen
Name: name, dtype: object
<class 'pandas.core.series.Series'>
1      Tom
2     jack
3    alice
4      bob
5    allen
Name: name, dtype: object
```

```python
print(df[['name', 'score']])  # 多列数据的获取
```

```
    name  score
1    Tom   60.5
2   jack   80.0
3  alice   30.6
4    bob   70.0
5  allen   83.5
```

```python
# 查看部分数据
print(df.head(2))
print(df.tail(3))
```

```
   name  id  age  score
1   Tom   1   15   60.5
2  jack   2   17   80.0
    name  id  age  score
3  alice   3   20   30.6
4    bob   4   26   70.0
5  allen   5   30   83.5
```

```python
# 使用布尔索引筛选数据
df[df.score > 70]
df[(df.score > 70) & (df.age < 20)]
```

```
   name  id  age  score
2  jack   2   17   80.0
```
```python
# 随机抽样
df.sample(3)
```

```
   name  id  age  score
4   bob   4   26   70.0
1   Tom   1   15   60.5
2  jack   2   17   80.0
```
```python
print(df.isin(['jack', 20]))  # 查看元素是否包含在参数集合中
```

```
    name     id    age  score
1  False  False  False  False
2   True  False  False  False
3  False  False   True  False
4  False  False  False  False
5  False  False  False  False
```

```python
print(df.isna())  # 查看元素是否是缺失值
```

```
    name     id    age  score
1  False  False  False  False
2  False  False  False  False
3  False  False  False  False
4  False  False  False  False
5  False  False  False  False
```

```python
print(df['score'].sum())  # 某一列的总和
print(df.score.max())  # 最大值
print(df.age.min())  # 最小值
print(df.score.mean())  # 平均值
print(df.score.median())  # 中位数
print(df.age.mode())  # 众数
print(df.score.std())  # 标准差
print(df.score.quantile(0.25))  # 分位数
print(df.describe())
```

```
324.6
83.5
15
64.92
70.0
0    15
1    17
2    20
3    26
4    30
Name: age, dtype: int64
21.188605428390044
60.5
             id        age      score
count  5.000000   5.000000   5.000000
mean   3.000000  21.600000  64.920000
std    1.581139   6.268971  21.188605
min    1.000000  15.000000  30.600000
25%    2.000000  17.000000  60.500000
50%    3.000000  20.000000  70.000000
75%    4.000000  26.000000  80.000000
max    5.000000  30.000000  83.500000
```

```python
print(df.count())  # 返回每一列非缺失值的个数
```

```
name     5
id       5
age      5
score    5
dtype: int64
```

```python
print(df.value_counts())  # 出现的次数
```

```
name   id  age  score
Tom    1   15   60.5     1
alice  3   20   30.6     1
allen  5   30   83.5     1
bob    4   26   70.0     1
jack   2   17   80.0     1
Name: count, dtype: int64
```

```python
print(df.drop_duplicates())
```

```
    name  id  age  score
1    Tom   1   15   60.5
2   jack   2   17   80.0
3  alice   3   20   30.6
4    bob   4   26   70.0
5  allen   5   30   83.5
```

```python
print(df.duplicated(subset=['age']))  # 查看是否重复
```

```
1    False
2    False
3    False
4    False
5    False
dtype: bool
```

```python
df.sample(2)  # 随机抽样
```

```
   name  id  age  score
1   Tom   1   15   60.5
2  jack   2   17   80.0
```
```python
print(df.replace(15, 30))
```

```
    name  id  age  score
1    Tom   1   30   60.5
2   jack   2   17   80.0
3  alice   3   20   30.6
4    bob   4   26   70.0
5  allen   5   30   83.5
```

```python
df.cumsum()
df.cummax()
df.cummin(axis=0)
```

```
   name  id  age  score
1   Tom   1   15   60.5
2   Tom   1   15   60.5
3   Tom   1   15   30.6
4   Tom   1   15   30.6
5   Tom   1   15   30.6
```
```python
print(df.sort_index(ascending=False))
```

```
    name  id  age  score
5  allen   5   30   83.5
4    bob   4   26   70.0
3  alice   3   20   30.6
2   jack   2   17   80.0
1    Tom   1   15   60.5
```

```python
print(df.sort_values(by=['score', 'age']))
```

```
    name  id  age  score
3  alice   3   20   30.6
1    Tom   1   15   60.5
4    bob   4   26   70.0
2   jack   2   17   80.0
5  allen   5   30   83.5
```

```python
df.nlargest(2, columns=['score', 'age'])
df.nsmallest(2, columns=['score', 'age'])
```

```
    name  id  age  score
3  alice   3   20   30.6
1    Tom   1   15   60.5
```