pandas? The giant panda, China's national treasure 😯? No, it's the core Python package for data analysis!!!
So what makes it so charming? Let's find out as we learn, and like it along the way 😛
I. First, of course: setting up the environment
Just type the following command in a Windows command prompt or in PyCharm's built-in Terminal:
pip install pandas
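To confirm the install worked, a quick check is enough (just a sketch; any reasonably recent pandas version is fine for this tutorial):
# verify that pandas imports and report its version
import pandas as pd
print(pd.__version__)  # e.g. 1.2.x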
II. The Series data structure 👇
==A Series is a one-dimensional array-like object, made up of a set of values and a corresponding set of labels (the index)==
1. Importing the packages
import pandas as pd
import numpy as np
2. Creating a Series
The most common ways to create one:
a = pd.Series([3, 4, 5, 6])
print(a)
# 0 3
# 1 4
# 2 5
# 3 6
# dtype: int64
b = pd.Series([4, 5, 6, 7], index=['a', 'b', 'c', 'd'])
print(b)
# a 4
# b 5
# c 6
# d 7
# dtype: int64
a = pd.Series(np.arange(5, 10))
print(a)
# 0 5
# 1 6
# 2 7
# 3 8
# 4 9
# dtype: int32
a = pd.Series({'a': 4, 'c': 7, 'g': 2, 'd': 9})
print(a)
# a 4
# c 7
# g 2
# d 9
# dtype: int64
3. Indexing a Series
b = pd.Series([4, 5, 6, 7], index=['a', 'b', 'c', 'd'])
print(b)
# You can index either by position or by label
print(b[1]) # 5  positional index
print(b['a']) # 4  label index
print(b[[0, 2]])
# a 4
# c 6
# dtype: int64
print(b[['c', 'd']])
# c 6
# d 7
# dtype: int64
print(b[b > 6])
# d 7
# dtype: int64
4. Arithmetic
b = pd.Series([4, 5, 6, 7], index=['a', 'b', 'c', 'd'])
print(b + 2)
# a 6
# b 7
# c 8
# d 9
# dtype: int64
print(b + b)
# a 8
# b 10
# c 12
# d 14
# dtype: int64
print(b * b)
# a 16
# b 25
# c 36
# d 49
# dtype: int64
5. Slicing
a = pd.Series(np.arange(3, 8), ['a', 'b', 'c', 'd', 'e'])
print(a[2: 4])
# c 5
# d 6
# dtype: int32
print(a['a': 'c'])
# a 3
# b 4
# c 5
# dtype: int32
6. Checking whether a Series contains a given label
a = pd.Series({'w': 10, 't': 34, 'p': 26})
print(a)
print('a' in a) # False
print('p' in a) # True
7. How it differs from a dict
a = pd.Series({'w': 10, 't': 34, 'p': 26})
for i in a:
    print(i, end=' ')  # 10 34 26
In other words, iterating over a dict yields its keys by default, while iterating over a Series yields its values.
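If you ever want the dict-style behavior back, the Series API has it; a small sketch using the same data as above:
a = pd.Series({'w': 10, 't': 34, 'p': 26})
# .keys() returns the labels (same as a.index), .items() yields (label, value) pairs
for label in a.keys():
    print(label, end=' ')  # w t p
print()
for label, value in a.items():
    print(label, value)  # w 10, then t 34, then p 26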
8. Getting the index and the values
a = pd.Series({'w': 10, 't': 34, 'p': 26})
print(a.index) # Index(['w', 't', 'p'], dtype='object')
print(a.values) # [10 34 26]
9. The slightly confusing behavior of integer indexes
s = pd.Series(np.arange(10))
print(s)
# 0 0
# 1 1
# 2 2
# 3 3
# 4 4
# 5 5
# 6 6
# 7 7
# 8 8
# 9 9
# dtype: int32
s1 = s[6:]
print(s1)
# 6 6
# 7 7
# 8 8
# 9 9
# dtype: int32
# print(s1[1]) # raises a KeyError: the 1 is treated as a label, not a position, which is exactly the ambiguity
# The recommended way from here on:
print(s1.loc[6: 8]) # label-based indexing
# 6 6
# 7 7
# 8 8
# dtype: int32
print(s1.iloc[0: 2]) # position-based indexing
# 6 6
# 7 7
# dtype: int32
In short, once a Series ends up with an integer index like this, prefer loc[] for label-based indexing and iloc[] for position-based indexing.
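One more detail worth remembering: loc slices include the end label, while iloc slices follow the usual half-open Python convention. A minimal sketch reusing the s1 from above:
print(s1.loc[6: 9])   # label slice, the end label 9 IS included: 6, 7, 8, 9
print(s1.iloc[0: 3])  # position slice, the end position 3 is NOT included: 6, 7, 8
print(s1.iloc[-1])    # 9, negative positions also work with iloc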
10. Data alignment
s1 = pd.Series([13, 45, 36, 24], index=['c', 'd', 'b', 'a'])
s2 = pd.Series([26, 38, 19, 44], index=['b', 'a', 'd', 'c'])
print(s1 + s2)
# a 62
# b 62
# c 57
# d 64
# dtype: int64
s3 = pd.Series([13, 45, 36, 24, 56], index=['c', 'f', 'b', 'a', 'e'])
print(s1 + s3) # in pandas, NaN marks a missing value
# a 48.0
# b 72.0
# c 26.0
# d NaN
# e NaN
# f NaN
# dtype: float64
# add, sub, mul, div: the method forms of addition, subtraction, multiplication, division
print(s1.add(s3))
# a 48.0
# b 72.0
# c 26.0
# d NaN
# e NaN
# f NaN
# dtype: float64
print(s1.add(s3, fill_value=0)) # treat missing positions as 0 before adding
# a 48.0
# b 72.0
# c 26.0
# d 45.0
# e 56.0
# f 45.0
# dtype: float64
11. Handling missing values in a Series
s1 = pd.Series([13, 45, 36, 24], index=['c', 'd', 'b', 'a'])
s3 = pd.Series([13, 45, 36, 24, 56], index=['c', 'f', 'b', 'a', 'e'])
s = s1 + s3
print(s)
# a 48.0
# b 72.0
# c 26.0
# d NaN
# e NaN
# f NaN
# dtype: float64
# Check element-wise whether each value is missing
print(s.isnull())
# a False
# b False
# c False
# d True
# e True
# f True
# dtype: bool
print(s.notnull())
# a True
# b True
# c True
# d False
# e False
# f False
# dtype: bool
# Filtering out missing values
# ① drop the missing values
print(s[s.notnull()])
# a 48.0
# b 72.0
# c 26.0
# dtype: float64
print(s.dropna())
# a 48.0
# b 72.0
# c 26.0
# dtype: float64
# ② keep the missing values
# by giving them a new value
print(s.fillna(0)) # fill missing values with 0
# a 48.0
# b 72.0
# c 26.0
# d 0.0
# e 0.0
# f 0.0
# dtype: float64
# fill missing values with the mean
print(s.fillna(s.mean()))
# a 48.000000
# b 72.000000
# c 26.000000
# d 48.666667
# e 48.666667
# f 48.666667
# dtype: float64
print(s.fillna(round(s.mean(), 2))) # keep two decimal places (rounded)
# a 48.00
# b 72.00
# c 26.00
# d 48.67
# e 48.67
# f 48.67
# dtype: float64
III. The DataFrame data structure 👇
A DataFrame is a tabular data structure that holds an ordered collection of columns.
1. Creating a DataFrame
df = pd.DataFrame({'name': ['hpl', 'laowang', 'xpz'], 'age': [18, 21, 45]})
print(df)
# name age
# 0 hpl 18
# 1 laowang 21
# 2 xpz 45
df1 = pd.DataFrame({'name': ['hpl', 'laowang', 'xpz'], 'age': [18, 21, 45]}, index=['a', 'b', 'c'])
print(df1)
# name age
# a hpl 18
# b laowang 21
# c xpz 45
df2 = pd.DataFrame({
    'name': pd.Series(['hpl', 'lw', 'xpz'], index=['a', 'b', 'c']),
    'age': pd.Series([18, 23, 45, 22], index=['a', 'b', 'c', 'd'])})
print(df2)
# name age
# a hpl 18
# b lw 23
# c xpz 45
# d NaN 22
2. Common DataFrame attributes
df = pd.DataFrame({
    'name': pd.Series(['hpl', 'lw', 'xpz'], index=['a', 'b', 'c']),
    'age': pd.Series([18, 23, 45, 22], index=['a', 'b', 'c', 'd'])})
print(df)
# name age
# a hpl 18
# b lw 23
# c xpz 45
# d NaN 22
print(df.index) # Index(['a', 'b', 'c', 'd'], dtype='object')  returns the row index
print(df.values) # returns the values as a 2-D array
# [['hpl' 18]
# ['lw' 23]
# ['xpz' 45]
# [nan 22]]
print(df.T) # transpose
# a b c d
# name hpl lw xpz NaN
# age 18 23 45 22
print(df.columns) # Index(['name', 'age'], dtype='object')  returns the column index
print(df.describe()) # summary statistics for the numeric columns
#              age
# count   4.000000   number of values
# mean   27.000000   mean
# std    12.192894   standard deviation
# min    18.000000   minimum
# 25%    21.000000   first quartile
# 50%    22.500000   median
# 75%    28.500000   third quartile
# max    45.000000   maximum
3. Indexing and slicing
data = pd.read_csv('vegetables.csv', header=None) # read a CSV file; read_csv is covered in detail later
print(data.head(6)) # the first six rows
print(data[0]) # column 0
print(data[0][1]) # the element at column 0, row 1 (chained indexing, not recommended)
print(data.loc[1, 2]) # the element at row 1, column 2 (recommended)
print(data.loc[: 3, :4])
# 0 1 2 3 4
# 0 沙蜜托 20.0 22.5 25.0 樱桃类
# 1 羊上脑 42.0 42.5 43.0 羊肉类
# 2 羊骨头 11.0 11.5 12.0 羊肉类
# 3 羊腩 32.0 32.0 32.0 羊肉类
print(data[:3]) # the first three rows
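If you don't have vegetables.csv at hand, the same indexing works on any DataFrame; here is a self-contained sketch (the column names and numbers are made up for illustration):
df = pd.DataFrame({'name': ['tomato', 'lamb', 'ribs'],
                   'low': [20.0, 42.0, 11.0],
                   'high': [25.0, 43.0, 12.0]})
print(df['low'])                 # select one column by name
print(df.loc[1, 'high'])         # 43.0, the element at row label 1, column 'high'
print(df.loc[:1, 'name':'low'])  # label slices are inclusive at both ends
print(df.iloc[0:2, 0:2])         # positional slicing: rows 0-1, columns 0-1
print(df[:2])                    # plain [] slicing with integers selects rows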
4. Data alignment and handling missing data
data = pd.read_csv('test1.csv')
# print(data)
# name age
# 0 hpl 18.0
# 1 lw NaN
# 2 xpz 45.0
# 3 NaN 22.0
print(data['age'] + data['age'])
# 0 36.0
# 1 NaN
# 2 90.0
# 3 44.0
# Name: age, dtype: float64
print(data.fillna(0)) # fill missing values with 0
# name age
# 0 hpl 18.0
# 1 lw 0.0
# 2 xpz 45.0
# 3 0 22.0
print(data.dropna()) # drop every row that contains a missing value
# name age
# 0 hpl 18.0
# 2 xpz 45.0
data2 = pd.read_csv('test2.csv')
print(data2)
# 0 hpl 18.0
# 1 NaN NaN
# 2 xpz 45.0
# 3 NaN 22.0
print(data2.dropna(how='all')) # how='all': drop a row only when every value in it is missing
# name age
# 0 hpl 18.0
# 2 xpz 45.0
# 3 NaN 22.0
print(data2.dropna(how='any')) # how='any' is the same as the default: drop a row if any value is missing
# name age
# 0 hpl 18.0
# 2 xpz 45.0
print(data2.dropna(axis=1)) # axis=0 means rows (drop rows with missing values), axis=1 means columns (drop columns with missing values)
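Since test1.csv / test2.csv are just tiny sample files, the same behavior can be reproduced with an inline DataFrame; a sketch, with np.nan standing in for the empty cells:
df = pd.DataFrame({'name': ['hpl', np.nan, 'xpz', np.nan],
                   'age': [18.0, np.nan, 45.0, 22.0]})
print(df.dropna())           # drop every row that contains a NaN: rows 0 and 2 remain
print(df.dropna(how='all'))  # drop a row only if all its values are NaN: only row 1 goes
print(df.dropna(axis=1))     # drop columns that contain a NaN: both columns go here
print(df.fillna({'name': 'unknown', 'age': df['age'].mean()}))  # per-column fill values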
5. Common pandas functions and sorting
data = pd.read_csv('vegetables.csv', header=None)
# Mean
print(data.mean()) # mean of each column (the default)
# 1 8.242353
# 2 8.863975
# 3 9.520366
# dtype: float64
print(data.mean(axis=1)) # axis=1: mean of each row
# Sum
print(data.sum()) # column sums by default
# 1 837151.05
# 2 900287.39
# 3 966954.97
# dtype: float64
print(data.sum(axis=1)) # axis=1: row sums
print(data.max())
# 1 300.0
# 2 315.0
# 3 330.0
# dtype: float64
print(data.min())
# 1 0.2
# 2 0.0
# 3 0.3
# dtype: float64
print(data.var()) # variance
# 1 279.696835
# 2 299.989904
# 3 324.831664
# dtype: float64
print(data.std()) # standard deviation
# 1 16.724139
# 2 17.320217
# 3 18.023087
# dtype: float64
# Sorting (rows whose sort key is NaN are not compared; they all go to the end)
print(data.sort_values(by=1)) # by=1: sort by column 1, ascending
print(data.sort_values(by=1, ascending=False)) # ascending=False sorts in descending order
print(data.sort_index()) # sort by row labels, ascending by default
print(data.sort_index(ascending=False)) # sort by row labels, descending
print(data.sort_index(axis=1)) # axis=1: sort the columns by their labels, ascending
print(data.sort_index(ascending=False, axis=1)) # axis=1: sort the columns by their labels, descending
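A self-contained sorting example in case the CSV isn't available (the numbers are invented):
df = pd.DataFrame({'price': [3.0, np.nan, 1.5, 2.2],
                   'volume': [10, 40, 25, 5]},
                  index=['d', 'a', 'c', 'b'])
print(df.sort_values(by='price'))                        # NaN rows are pushed to the end
print(df.sort_values(by='price', na_position='first'))   # or put them first
print(df.sort_values(by=['volume', 'price'], ascending=[False, True]))  # multi-column sort
print(df.sort_index())                                   # sort by the row labels a, b, c, d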
6. Processing / generating datetime objects in bulk
import datetime

print(datetime.datetime.strptime('2021-05-19', '%Y-%m-%d')) # parse a string into a datetime: 2021-05-19 00:00:00
# pandas can parse a whole batch of date strings at once
print(pd.to_datetime(['19/05/2021', '2021-05-20', '2021/May/21']))
# DatetimeIndex(['2021-05-19', '2021-05-20', '2021-05-21'], dtype='datetime64[ns]', freq=None)
# pandas can also generate ranges of dates
print(pd.date_range('2021-05-10', '2021-05-19')) # give a start and an end; the default step is one day
# DatetimeIndex(['2021-05-10', '2021-05-11', '2021-05-12', '2021-05-13',
# '2021-05-14', '2021-05-15', '2021-05-16', '2021-05-17',
# '2021-05-18', '2021-05-19'],
# dtype='datetime64[ns]', freq='D')
print(pd.date_range('2021-05-01', periods=9)) # periods=9: generate 9 dates; the step is still one day by default
# DatetimeIndex(['2021-05-01', '2021-05-02', '2021-05-03', '2021-05-04',
# '2021-05-05', '2021-05-06', '2021-05-07', '2021-05-08',
# '2021-05-09'],
# dtype='datetime64[ns]', freq='D')
# freq codes: S/s by second, T/t by minute, H/h by hour, W/w by week, B/b by business day (Mon-Fri), W-MON/w-mon every Monday, M/m by month, SM/sm by half month, Y/y by year; codes can be combined, e.g. '1h20min'
print(pd.date_range('2021-05-01', periods=10, freq='1h30min'))
# DatetimeIndex(['2021-05-01 00:00:00', '2021-05-01 01:30:00',
# '2021-05-01 03:00:00', '2021-05-01 04:30:00',
# '2021-05-01 06:00:00', '2021-05-01 07:30:00',
# '2021-05-01 09:00:00', '2021-05-01 10:30:00',
# '2021-05-01 12:00:00', '2021-05-01 13:30:00'],
# dtype='datetime64[ns]', freq='90T')
7. Time series in pandas
data = pd.read_csv('vegetables.csv', header=None, index_col=6, parse_dates=True).sort_index()
print(data['2021-05']) # all rows from May 2021
print(data['2021']) # all rows from 2021
print(data['2020': '2021-02']) # everything from 2020 through February 2021
print(data['2020-11-23': '2021-01-15']) # everything from 2020-11-23 through 2021-01-15
print(data.resample('d').sum()) # daily totals
print(data.resample('w').sum()) # weekly totals
print(data.resample('m').sum()) # monthly totals
print(data.resample('y').sum()) # yearly totals
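resample only needs a DatetimeIndex, so the pattern can be tried without the CSV; a minimal sketch combining date_range with a Series (the values are made up):
idx = pd.date_range('2021-05-01', periods=30, freq='D')  # 30 consecutive days
s = pd.Series(np.arange(30), index=idx)
print(s['2021-05'])            # partial-string indexing: everything from May 2021
print(s.resample('W').sum())   # weekly totals
print(s.resample('M').mean())  # monthly average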
IV. Reading / writing files 👇
1. Reading files
① Data that has column names
data = pd.read_csv('shares_data.csv') # reads the whole file into a DataFrame; the row and column counts are printed at the bottom
print(data)
date open close high low volume code
0 2001-02-28 3.207 3.033 3.237 3.026 307391.62 600550
1 2001-03-01 3.033 2.991 3.074 2.951 54787.51 600550
2 2001-03-02 3.006 3.000 3.030 2.978 21470.69 600550
3 2001-03-05 3.000 3.020 3.024 2.954 20098.25 600550
4 2001-03-06 3.015 2.986 3.015 2.969 12183.90 600550
... ... ... ... ... ... ... ...
4729 2021-05-12 4.150 4.200 4.250 4.110 62450.00 600550
4730 2021-05-13 4.180 4.120 4.190 4.110 50634.00 600550
4731 2021-05-14 4.120 4.150 4.160 4.110 31483.00 600550
4732 2021-05-17 4.150 4.170 4.200 4.110 48353.00 600550
4733 2021-05-18 4.170 4.180 4.180 4.150 27904.00 600550
[4734 rows x 7 columns]
data = pd.read_csv('shares_data.csv')
print(data.head(5)) # head(5): only read the first 5 rows
date open close high low volume code
0 2001-02-28 3.207 3.033 3.237 3.026 307391.62 600550
1 2001-03-01 3.033 2.991 3.074 2.951 54787.51 600550
2 2001-03-02 3.006 3.000 3.030 2.978 21470.69 600550
3 2001-03-05 3.000 3.020 3.024 2.954 20098.25 600550
4 2001-03-06 3.015 2.986 3.015 2.969 12183.90 600550
data = pd.read_csv('shares_data.csv', index_col='date') # index_col='date': use the date column as the index
print(data)
print(data.index)
open close high low volume code
date
2001-02-28 3.207 3.033 3.237 3.026 307391.62 600550
2001-03-01 3.033 2.991 3.074 2.951 54787.51 600550
2001-03-02 3.006 3.000 3.030 2.978 21470.69 600550
2001-03-05 3.000 3.020 3.024 2.954 20098.25 600550
2001-03-06 3.015 2.986 3.015 2.969 12183.90 600550
... ... ... ... ... ... ...
2021-05-12 4.150 4.200 4.250 4.110 62450.00 600550
2021-05-13 4.180 4.120 4.190 4.110 50634.00 600550
2021-05-14 4.120 4.150 4.160 4.110 31483.00 600550
2021-05-17 4.150 4.170 4.200 4.110 48353.00 600550
2021-05-18 4.170 4.180 4.180 4.150 27904.00 600550
[4734 rows x 6 columns]
Index(['2001-02-28', '2001-03-01', '2001-03-02', '2001-03-05', '2001-03-06',
'2001-03-07', '2001-03-08', '2001-03-09', '2001-03-12', '2001-03-13',
...
'2021-04-30', '2021-05-06', '2021-05-07', '2021-05-10', '2021-05-11',
'2021-05-12', '2021-05-13', '2021-05-14', '2021-05-17', '2021-05-18'],
dtype='object', name='date', length=4734)
data = pd.read_csv('shares_data.csv', index_col='date', parse_dates=True) # index_col='date': use the date column as the index; parse_dates=True: parse every column that can be interpreted as dates into datetimes (parse_dates=['date'] parses only the date column)
print(data)
print(data.index)
open close high low volume code
date
2001-02-28 3.207 3.033 3.237 3.026 307391.62 600550
2001-03-01 3.033 2.991 3.074 2.951 54787.51 600550
2001-03-02 3.006 3.000 3.030 2.978 21470.69 600550
2001-03-05 3.000 3.020 3.024 2.954 20098.25 600550
2001-03-06 3.015 2.986 3.015 2.969 12183.90 600550
... ... ... ... ... ... ...
2021-05-12 4.150 4.200 4.250 4.110 62450.00 600550
2021-05-13 4.180 4.120 4.190 4.110 50634.00 600550
2021-05-14 4.120 4.150 4.160 4.110 31483.00 600550
2021-05-17 4.150 4.170 4.200 4.110 48353.00 600550
2021-05-18 4.170 4.180 4.180 4.150 27904.00 600550
[4734 rows x 6 columns]
DatetimeIndex(['2001-02-28', '2001-03-01', '2001-03-02', '2001-03-05',
'2001-03-06', '2001-03-07', '2001-03-08', '2001-03-09',
'2001-03-12', '2001-03-13',
...
'2021-04-30', '2021-05-06', '2021-05-07', '2021-05-10',
'2021-05-11', '2021-05-12', '2021-05-13', '2021-05-14',
'2021-05-17', '2021-05-18'],
dtype='datetime64[ns]', name='date', length=4734, freq=None)
② Data without column names
data = pd.read_csv('shares_data_2.csv', header=None) # header=None: the data has no column names, so pandas numbers the columns 0, 1, 2, 3, 4, ...
print(data)
0 1 2 3 4 5 6
0 2001-02-28 3.207 3.033 3.237 3.0260000000000002 307391.62 600550
1 2001-03-01 3.033 2.991 3.074 none 54787.51 600550
2 2001-03-02 3.006 3.000 3.030 2.978 21470.69 600550
3 2001-03-05 NaN 3.020 3.024 2.9539999999999997 20098.25 600550
4 2001-03-06 3.015 2.986 3.015 2.969 12183.90 600550
... ... ... ... ... ... ... ...
4729 2021-05-12 4.150 4.200 4.250 4.11 62450.00 600550
4730 2021-05-13 4.180 4.120 4.190 4.11 50634.00 600550
4731 2021-05-14 4.120 4.150 4.160 4.11 31483.00 600550
4732 2021-05-17 4.150 4.170 4.200 4.11 48353.00 600550
4733 2021-05-18 4.170 4.180 4.180 4.15 27904.00 600550
[4734 rows x 7 columns]
data = pd.read_csv('shares_data_2.csv', names=['a', 'b', 'c', 'd', 'e', 'f', 'g']) # names=[]: supply your own column names
data = pd.read_csv('shares_data_2.csv', names=['a', 'b', 'c', 'd', 'e', 'f', 'g'], skiprows=[1, 2, 3, 4]) # skiprows=[]: skip the listed rows
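These read options can also be tried without the data file by feeding a string through io.StringIO; a small sketch (the rows are invented):
import io
csv_text = '2001-02-28,3.207,3.033\n2001-03-01,3.033,2.991\n2001-03-02,3.006,3.000\n'
print(pd.read_csv(io.StringIO(csv_text), header=None))  # no header row, so the columns become 0, 1, 2
print(pd.read_csv(io.StringIO(csv_text), names=['date', 'open', 'close'], skiprows=[1]))  # custom names, skip row 1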
③ Handling missing values on read
data = pd.read_csv('shares_data_2.csv', header=None)
print(data.head(5))
print(data[4]) # dtype: object; the string 'none' forces the whole column to be read as strings
0 1 2 3 4 5 6
0 2001-02-28 3.207 3.033 3.237 3.0260000000000002 307391.62 600550
1 2001-03-01 3.033 2.991 3.074 none 54787.51 600550
2 2001-03-02 3.006 3.000 3.030 2.978 21470.69 600550
3 2001-03-05 NaN 3.020 3.024 2.9539999999999997 20098.25 600550
4 2001-03-06 3.015 2.986 3.015 2.969 12183.90 600550
0 3.0260000000000002
1 none
2 2.978
3 2.9539999999999997
4 2.969
...
4729 4.11
4730 4.11
4731 4.11
4732 4.11
4733 4.15
Name: 4, Length: 4734, dtype: object
data = pd.read_csv('shares_data_2.csv', header=None, na_values=['none', 'null']) # na_values=['none', 'null']: which strings should be interpreted as missing
print(data[4]) # dtype: float64
print(data.head(5))
0 3.026
1 NaN
2 2.978
3 2.954
4 2.969
...
4729 4.110
4730 4.110
4731 4.110
4732 4.110
4733 4.150
Name: 4, Length: 4734, dtype: float64
0 1 2 3 4 5 6
0 2001-02-28 3.207 3.033 3.237 3.026 307391.62 600550
1 2001-03-01 3.033 2.991 3.074 NaN 54787.51 600550
2 2001-03-02 3.006 3.000 3.030 2.978 21470.69 600550
3 2001-03-05 NaN 3.020 3.024 2.954 20098.25 600550
4 2001-03-06 3.015 2.986 3.015 2.969 12183.90 600550
2. Writing files
data = pd.read_csv('shares_data.csv')
data.to_csv('shares_data_2.csv', header=False, index=False, encoding='utf-8') # header=False: don't write the column names; index=False: don't write the index column
data.to_csv('data_3.csv', header=False, index=False, columns=['date', 'open', 'close', 'high', 'low']) # columns=[...]: write only the listed columns (the names must match the DataFrame's column labels)
data.to_csv('data_4.csv', header=False, index=False, columns=['date', 'open', 'close', 'high', 'low'], na_rep='NULL') # na_rep='NULL': how missing values are written to the file
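to_csv can also return the CSV text directly when no path is given, which is handy for checking what would be written; a short sketch:
df = pd.DataFrame({'name': ['hpl', 'lw'], 'age': [18, None]})
text = df.to_csv(index=False, na_rep='NULL')  # no path: returns the CSV contents as a string
print(text)
# name,age
# hpl,18.0
# lw,NULL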
With that, the pandas basics are complete, and you're ready to come down from the mountain!
Reposted from: https://blog.csdn.net/hpl980342791/article/details/117060152