import numpy as np
import pandas as pd
from pandas import Series, DataFrame
Series 由一组数据(各种 Numpy 数据类型)和数据标签(索引)组成,也可以通过 Python 的字典类型创建
obj = pd.Series([4, 7, 8, -1])
obj
0 4
1 7
2 8
3 -1
dtype: int64
# 指定索引值
obj1 = pd.Series([1, 2, 3, 4],
index=['a', 'b', 'c', 'd'])
obj1
a 1
b 2
c 3
d 4
dtype: int64
# 通过字典的形式创建Series数据
# S中的索引就是原来字典中的键
data1 = {"city": "shenzhen", "age": 25, "number": 123456}
obj2 = pd.Series(data1)
obj2
city shenzhen
age 25
number 123456
dtype: object
Series 中值的获取:可以使用 Series 自己创建时候的索引,也可以使用默认的数值索引
# 通过自己创建的索引来获取数据
obj1[['a', 'c', 'b']]
a 1
c 3
b 2
dtype: int64
# 默认数值索引来获取数据
obj1[:3]
a 1
b 2
c 3
dtype: int64
# 布尔型数组过滤掉不满足要求的数据
obj1[obj1 >= 2]
b 2
c 3
d 4
dtype: int64
# 上面的obj1
obj1
a 1
b 2
c 3
d 4
dtype: int64
obj1['b']
2
# 通过标签切片的形式,包含末端,和Python的普通切片不同
obj1["a":"c"]
a 1
b 2
c 3
dtype: int64
obj1[["b", "a", "d"]]
b 2
a 1
d 4
dtype: int64
obj1[1]
2
obj1[1:4]
b 2
c 3
d 4
dtype: int64
obj1[[1, 3]]
b 2
d 4
dtype: int64
obj1[obj1 > 2]
c 3
d 4
dtype: int64
缺失值用 NaN 表示,可以用 isnull 和 notnull 检测缺失值
# 上面的obj2
obj2
city shenzhen
age 25
number 123456
dtype: object
# sex对应的值找不到,用NaN表示
data2 = ["sex", "city", "age", "number"]
obj3 = pd.Series(obj2, index=data2)
obj3
sex NaN
city shenzhen
age 25
number 123456
dtype: object
pd.isnull(obj3)
sex True
city False
age False
number False
dtype: bool
# Series的实例方法isnull和notnull
obj3.isnull()
sex True
city False
age False
number False
dtype: bool
Series 对象的 name 属性:Series 数据本身和索引都有 name 属性,可以直接设置 name 属性的值
# S的name属性
obj3.name = "person"
# S索引的name属性
obj3.index.name = "information"
obj3
information
sex NaN
city shenzhen
age 25
number 123456
Name: person, dtype: object
# 将上面例子中的索引number修改为phone_num
obj3.index = ["sex", "city", "age", "phone_num"]
obj3
sex NaN
city shenzhen
age 25
phone_num 123456
Name: person, dtype: object
创建 DataFrame 时,可以通过 columns 参数指定各个属性的顺序
# 1.通过传入等长列表或者Numpy数组组成的字典
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
'year': [2000, 2001, 2002, 2001, 2002, 2003],
'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)
frame
| | state | year | pop |
---|---|---|---|
0 | Ohio | 2000 | 1.5 |
1 | Ohio | 2001 | 1.7 |
2 | Ohio | 2002 | 3.6 |
3 | Nevada | 2001 | 2.4 |
4 | Nevada | 2002 | 2.9 |
5 | Nevada | 2003 | 3.2 |
# 2. 指定列序列创建,通过columns参数
# 结果中3个列属性和上面的顺序不同
pd.DataFrame(data, columns=["year", "state", "pop"])
| | year | state | pop |
---|---|---|---|
0 | 2000 | Ohio | 1.5 |
1 | 2001 | Ohio | 1.7 |
2 | 2002 | Ohio | 3.6 |
3 | 2001 | Nevada | 2.4 |
4 | 2002 | Nevada | 2.9 |
5 | 2003 | Nevada | 3.2 |
# 3、columns中的属性如果不存在,则结果中用缺失值代替,debt属性
# 4、在DF中传入指定的index,有one-six
frame2 = pd.DataFrame(data, columns=["year", "state", "pop", "debt"],
index=["one", "two", "three", "four", "five", "six"])
frame2
| | year | state | pop | debt |
---|---|---|---|---|
one | 2000 | Ohio | 1.5 | NaN |
two | 2001 | Ohio | 1.7 | NaN |
three | 2002 | Ohio | 3.6 | NaN |
four | 2001 | Nevada | 2.4 | NaN |
five | 2002 | Nevada | 2.9 | NaN |
six | 2003 | Nevada | 3.2 | NaN |
列的值可以是 numpy 生成的数据
# 1、获取DF中的列属性和索引
print(frame2.columns)
print(frame2.index)
Index(['year', 'state', 'pop', 'debt'], dtype='object')
Index(['one', 'two', 'three', 'four', 'five', 'six'], dtype='object')
# 2、查看values
frame2.values
array([[2000, 'Ohio', 1.5, nan],
[2001, 'Ohio', 1.7, nan],
[2002, 'Ohio', 3.6, nan],
[2001, 'Nevada', 2.4, nan],
[2002, 'Nevada', 2.9, nan],
[2003, 'Nevada', 3.2, nan]], dtype=object)
# 3、查看列数据
frame2.year
one 2000
two 2001
three 2002
four 2001
five 2002
six 2003
Name: year, dtype: int64
frame2["year"]
one 2000
two 2001
three 2002
four 2001
five 2002
six 2003
Name: year, dtype: int64
# 4、赋值修改列
# 注意是整列的修改
frame2["debt"] = 18
frame2
| | year | state | pop | debt |
---|---|---|---|---|
one | 2000 | Ohio | 1.5 | 18 |
two | 2001 | Ohio | 1.7 | 18 |
three | 2002 | Ohio | 3.6 | 18 |
four | 2001 | Nevada | 2.4 | 18 |
five | 2002 | Nevada | 2.9 | 18 |
six | 2003 | Nevada | 3.2 | 18 |
# 传入numpy数据
frame2["debt"] = np.arange(6.0)
frame2
| | year | state | pop | debt |
---|---|---|---|---|
one | 2000 | Ohio | 1.5 | 0.0 |
two | 2001 | Ohio | 1.7 | 1.0 |
three | 2002 | Ohio | 3.6 | 2.0 |
four | 2001 | Nevada | 2.4 | 3.0 |
five | 2002 | Nevada | 2.9 | 4.0 |
six | 2003 | Nevada | 3.2 | 5.0 |
# 传入S型数据
val = pd.Series([1.2, 1.9, 2], index=["one", "three", "five"])
frame2["debt"] = val
frame2
| | year | state | pop | debt |
---|---|---|---|---|
one | 2000 | Ohio | 1.5 | 1.2 |
two | 2001 | Ohio | 1.7 | NaN |
three | 2002 | Ohio | 3.6 | 1.9 |
four | 2001 | Nevada | 2.4 | NaN |
five | 2002 | Nevada | 2.9 | 2.0 |
six | 2003 | Nevada | 3.2 | NaN |
# 1、2
# 先判断state属性的值是否为Ohio
# 如果等于,将eastern属性的值设为T,否则为F
# eastern属性是新建的,只能通过字典标记的形式(frame2["eastern"] = ...)创建,不能用frame2.eastern = ...的属性形式创建新列
frame2["eastern"] = (frame2.state == "Ohio")
frame2
| | year | state | pop | debt | eastern |
---|---|---|---|---|---|
one | 2000 | Ohio | 1.5 | 1.2 | True |
two | 2001 | Ohio | 1.7 | NaN | True |
three | 2002 | Ohio | 3.6 | 1.9 | True |
four | 2001 | Nevada | 2.4 | NaN | False |
five | 2002 | Nevada | 2.9 | 2.0 | False |
six | 2003 | Nevada | 3.2 | NaN | False |
# 3 删除数据
del frame2["eastern"]
frame2
| | year | state | pop | debt |
---|---|---|---|---|
one | 2000 | Ohio | 1.5 | 1.2 |
two | 2001 | Ohio | 1.7 | NaN |
three | 2002 | Ohio | 3.6 | 1.9 |
four | 2001 | Nevada | 2.4 | NaN |
five | 2002 | Nevada | 2.9 | 2.0 |
six | 2003 | Nevada | 3.2 | NaN |
frame2["pop"]
one 1.5
two 1.7
three 3.6
four 2.4
five 2.9
six 3.2
Name: pop, dtype: float64
frame2.pop
<bound method NDFrame.pop of year state pop debt
one 2000 Ohio 1.5 1.2
two 2001 Ohio 1.7 NaN
three 2002 Ohio 3.6 1.9
four 2001 Nevada 2.4 NaN
five 2002 Nevada 2.9 2.0
six 2003 Nevada 3.2 NaN>
# 4 嵌套字典创建DF:外层为列属性,内层为行
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
frame3 = pd.DataFrame(pop)
frame3
| | Nevada | Ohio |
---|---|---|
2000 | NaN | 1.5 |
2001 | 2.4 | 1.7 |
2002 | 2.9 | 3.6 |
# 5、转置操作
frame3.T
| | 2000 | 2001 | 2002 |
---|---|---|---|
Nevada | NaN | 2.4 | 2.9 |
Ohio | 1.5 | 1.7 | 3.6 |
# 6 、DF中传入S型数据
pdata = {'Ohio': frame3['Ohio'][:-1],
'Nevada': frame3['Nevada'][:2]}
pd.DataFrame(pdata)
| | Ohio | Nevada |
---|---|---|
2000 | 1.5 | NaN |
2001 | 1.7 | 2.4 |
# 获取DF中的所有数据
frame2.values
array([[2000, 'Ohio', 1.5, 1.2],
[2001, 'Ohio', 1.7, nan],
[2002, 'Ohio', 3.6, 1.9],
[2001, 'Nevada', 2.4, nan],
[2002, 'Nevada', 2.9, 2.0],
[2003, 'Nevada', 3.2, nan]], dtype=object)
Stay Foolish Stay Hungry