小言_互联网的博客

python 非值数据

378人阅读  评论(0)
# Missing ("non-value") data: what pandas treats as missing
import pandas as pd
from numpy import nan

# NumPy 2.0 removed the `NaN` and `NAN` aliases, so importing them raises
# ImportError there. Only `nan` is still exported; the old spellings are
# re-created locally so the examples below keep their original names.
NaN = NAN = nan

# NaN marks a missing value. It compares unequal to everything, so none of
# these equality checks can detect it.
print(NaN == True)
print(NaN == False)
print(NaN == 0)
print(NaN == '')

# Use the pandas helper functions to test for missing values instead.
x = NaN
y = NaN
n = 20
print(pd.isnull(x))
print(pd.isnull(y))
# An ordinary number is not missing.
print(pd.notnull(n))

(venv) E:\py3>python py21.py
False
False
False
False
True
True
True

# Loading data
visited_file = './data/survey_visited.csv'
# A plain pd.read_csv(visited_file) would apply pandas' default missing-value
# markers. Here the defaults are switched off (keep_default_na=False) and a
# single space is the only string parsed as NaN.
print(
    pd.read_csv(
        visited_file,
        keep_default_na=False,
        na_values=[' '],
    )
)

ident site dated
0 619 DR-1 1927-02-08
1 622 DR-1 1927-02-10
2 734 DR-3 1939-01-07
3 735 DR-3 1930-01-12
4 751 DR-3 1930-02-26
5 752 DR-3
6 837 MSK-4 1932-01-14
7 844 DR-1 1932-03-22

# Merging data
visited = pd.read_csv('./data/survey_visited.csv')
survey = pd.read_csv('./data/survey_survey.csv')
# Join each visit to its survey readings: visited.ident matches survey.taken.
# how='inner' is the default join type, spelled out here for clarity.
vs = visited.merge(
    right=survey,
    how='inner',
    left_on='ident',
    right_on='taken',
)
print(vs)

(venv) E:\py3>python py21.py
ident site dated taken person quant reading
0 619 DR-1 1927-02-08 619 dyer rad 9.82
1 619 DR-1 1927-02-08 619 dyer sal 0.13
2 622 DR-1 1927-02-10 622 dyer rad 7.80
3 622 DR-1 1927-02-10 622 dyer sal 0.09
4 734 DR-3 1939-01-07 734 pb rad 8.41
5 734 DR-3 1939-01-07 734 lake sal 0.05
6 734 DR-3 1939-01-07 734 pb temp -21.50
7 735 DR-3 1930-01-12 735 pb rad 7.22
8 735 DR-3 1930-01-12 735 NaN sal 0.06
9 735 DR-3 1930-01-12 735 NaN temp -26.00
10 751 DR-3 1930-02-26 751 pb rad 4.35
11 751 DR-3 1930-02-26 751 pb temp -18.50
12 751 DR-3 1930-02-26 751 lake sal 0.10
13 752 DR-3 NaN 752 lake rad 2.19
14 752 DR-3 NaN 752 lake sal 0.09
15 752 DR-3 NaN 752 lake temp -16.00
16 752 DR-3 NaN 752 roe sal 41.60
17 837 MSK-4 1932-01-14 837 lake rad 1.46
18 837 MSK-4 1932-01-14 837 lake sal 0.21
19 837 MSK-4 1932-01-14 837 roe sal 22.50
20 844 DR-1 1932-03-22 844 roe rad 11.25


# Data entered by hand: one keyword per column, one list entry per row.
scientists = pd.DataFrame(
    dict(
        Name=['Bill', 'Mike'],
        Occupation=['Chemist', 'Statistician'],
    )
)
print(scientists)

(venv) E:\py3>python py21.py
Name Occupation
0 Bill Chemist
1 Mike Statistician

# NumPy 2.0 removed the `NaN` and `NAN` aliases; `nan` is the only spelling
# that still imports, and it is the only one this snippet uses.
from numpy import nan

# Assigning a scalar to a new column fills every row with it, which creates
# a column made up entirely of missing values.
scientists['missing'] = nan
print(scientists)

(venv) E:\py3>python py21.py
Name Occupation missing
0 Bill Chemist NaN
1 Mike Statistician NaN

# Re-indexing
gapminder = pd.read_csv(
    './data/gapminder.tsv',
    sep='\t',
)
# Mean life expectancy for each year present in the data set.
life_exp = gapminder.groupby(by=['year'])['lifeExp'].mean()
# Disabled demo calls:
#   print(life_exp)
#   print(life_exp.loc[range(2000, 2003)])
# NOTE(review): on pandas >= 1.0 `.loc` raises KeyError when a requested
# label is absent; life_exp.reindex(range(2000, 2003)) is the documented
# replacement.

数据中只有2002年的值,其他年份没有数据,所以用NaN表示
year
2000 NaN
2001 NaN
2002 65.694923

import pandas as pd


# 1. Fill NaN values
# 2. Drop rows that contain NaN
gapminder = pd.read_csv('./data/gapminder.tsv', sep='\t')
life_exp = gapminder.groupby(['year'])['lifeExp'].mean()

# `.loc` with a list of labels raises KeyError on pandas >= 1.0 when any
# label is absent. `reindex` is the supported way to request labels that may
# not exist: years without data come back as NaN.
a = life_exp.reindex(range(2000, 2010))
print(a)
# Replace every NaN with a constant. Filling with 0 keeps the float dtype;
# filling with a string turns the Series into object dtype.
print(a.fillna(0))
print(a.fillna('*'))

# Forward fill: each NaN takes the most recent earlier value. Leading NaNs
# stay NaN because nothing precedes them.
# (`fillna(method='ffill')` is deprecated since pandas 2.1; `ffill()` is the
# direct replacement.)
print(a.ffill())

# Backward fill: each NaN takes the next later value. Trailing NaNs stay NaN
# because nothing follows them.
print(a.bfill())

# Backward fill first, then forward fill, so no NaN is left at either end.
print(a.bfill().ffill())

# Linear interpolation: the difference between the two surrounding known
# values is spread evenly over the steps between them.
print(a.interpolate())

# NumPy 2.0 removed the `NaN` alias; import `nan` under the old name so the
# literal below keeps its original spelling.
from numpy import nan as NaN

# Worked example of how linear interpolation would fill this series
# (step = difference between known neighbours / (missing values between + 1)):
#   2 -> 4: one missing value,  step = (4 - 2) / (1 + 1) = 1, giving 3
#   4 -> 6: one missing value,  step = (6 - 4) / (1 + 1) = 1, giving 5
#   6 -> 9: two missing values, step = (9 - 6) / (2 + 1) = 1, giving 7 and 8
aa = pd.Series([NaN, NaN, 2, NaN, 4, NaN, 6, NaN, NaN, 9, NaN, NaN, NaN])

# Drop the entries that are NaN; the surviving values keep their original
# index labels.
print(aa.dropna())

year
2000 NaN
2001 NaN
2002 65.694923
2003 NaN
2004 NaN
2005 NaN
2006 NaN
2007 67.007423
2008 NaN
2009 NaN
Name: lifeExp, dtype: float64
year
2000 0.000000
2001 0.000000
2002 65.694923
2003 0.000000
2004 0.000000
2005 0.000000
2006 0.000000
2007 67.007423
2008 0.000000
2009 0.000000
Name: lifeExp, dtype: float64
year
2000 *
2001 *
2002 65.6949
2003 *
2004 *
2005 *
2006 *
2007 67.0074
2008 *
2009 *
Name: lifeExp, dtype: object
year
2000 NaN
2001 NaN
2002 65.694923
2003 65.694923
2004 65.694923
2005 65.694923
2006 65.694923
2007 67.007423
2008 67.007423
2009 67.007423
Name: lifeExp, dtype: float64
year
2000 65.694923
2001 65.694923
2002 65.694923
2003 67.007423
2004 67.007423
2005 67.007423
2006 67.007423
2007 67.007423
2008 NaN
2009 NaN
Name: lifeExp, dtype: float64
year
2000 65.694923
2001 65.694923
2002 65.694923
2003 67.007423
2004 67.007423
2005 67.007423
2006 67.007423
2007 67.007423
2008 67.007423
2009 67.007423
Name: lifeExp, dtype: float64
year
2000 NaN
2001 NaN
2002 65.694923
2003 65.957423
2004 66.219923
2005 66.482423
2006 66.744923
2007 67.007423
2008 67.007423
2009 67.007423
Name: lifeExp, dtype: float64
2 2.0
4 4.0
6 6.0
9 9.0
dtype: float64


转载:https://blog.csdn.net/u012164509/article/details/101770147
查看评论
* 以上用户言论只代表其个人观点,不代表本网站的观点或立场