Pandas


In [1]:
import numpy as np
import pandas as pd

Series

In [2]:
# Build a Series from a NumPy array with explicit string index labels.
arr = np.array([10, 20, 30])
labels = ['a', 'b', 'c']  # fixed typo: was `labes`

pd.Series(arr, index=labels)
Out[2]:
a    10
b    20
c    30
dtype: int64
In [4]:
# Two sales Series that share some index labels (US, HUN, GB).
sales1 = pd.Series(data=[100, 200, 150, 250], index=['US', 'HUN', 'GB', 'GER'])
sales2 = pd.Series(data=[200, 500, 150, 250], index=['US', 'HUN', 'GB', 'FR'])
sales1
Out[4]:
US     100
HUN    200
GB     150
GER    250
dtype: int64
In [5]:
# Positional access via sales1[0] on a label-indexed Series is deprecated
# (FutureWarning in pandas 2.x, label-only in 3.0) — use .iloc for positions.
print( sales1.iloc[0] )
print( sales1['US'] )
100
100
In [7]:
sales1 + sales2  # NaN where a label appears in only one of the two series
Out[7]:
FR       NaN
GB     300.0
GER      NaN
HUN    700.0
US     300.0
dtype: float64

DataFrames

In [8]:
# Column and row labels for the demo DataFrame.
columns = list('WXYZ')
index = list('ABCDE')
In [10]:
from numpy.random import randint

np.random.seed(42)  # fix the seed so every run produces the same values
shape = (5, 4)      # 5 rows (A-E) by 4 columns (W-Z)
data = randint(-100, 100, shape)
data
Out[10]:
array([[  2,  79,  -8, -86],
       [  6, -29,  88, -80],
       [  2,  21, -26, -13],
       [ 16,  -1,   3,  51],
       [ 30,  49, -48, -99]])
In [12]:
# Assemble the frame with explicit keyword arguments for readability.
df = pd.DataFrame(data=data, index=index, columns=columns)
df
Out[12]:
W X Y Z
A 2 79 -8 -86
B 6 -29 88 -80
C 2 21 -26 -13
D 16 -1 3 51
E 30 49 -48 -99
In [13]:
df['W']  # selecting a single column returns a Series
Out[13]:
A     2
B     6
C     2
D    16
E    30
Name: W, dtype: int64
In [14]:
# Chained indexing (df['W']['A']) works for reads but is fragile and causes
# SettingWithCopyWarning on writes — prefer a single .loc lookup.
df.loc['A', 'W']
Out[14]:
2
In [15]:
df[['W', 'Z']]  # a list of column names returns a DataFrame
Out[15]:
W Z
A 2 -86
B 6 -80
C 2 -13
D 16 51
E 30 -99
In [16]:
# Add a derived column: element-wise sum of W and Y.
df['NEW'] = df['W'] + df['Y']
df
Out[16]:
W X Y Z NEW
A 2 79 -8 -86 -6
B 6 -29 88 -80 94
C 2 21 -26 -13 -24
D 16 -1 3 51 19
E 30 49 -48 -99 -18
In [20]:
# Remove the derived column again; drop() returns a new frame,
# which we assign back to df.
df = df.drop(columns='NEW')
df
Out[20]:
W X Y Z
A 2 79 -8 -86
B 6 -29 88 -80
C 2 21 -26 -13
D 16 -1 3 51
E 30 49 -48 -99
In [23]:
# to select a row by its label use loc; the result is a Series
df.loc['A']
Out[23]:
W     2
X    79
Y    -8
Z   -86
Name: A, dtype: int64
In [24]:
df.loc[['A', 'E']]  # a list of row labels returns a DataFrame
Out[24]:
W X Y Z
A 2 79 -8 -86
E 30 49 -48 -99
In [25]:
df.iloc[1:3]  # rows by integer position; the slice end is exclusive
Out[25]:
W X Y Z
B 6 -29 88 -80
C 2 21 -26 -13
In [27]:
df.loc[['A', 'C'], ['W', 'Y']]  # row and column subsets in a single lookup
Out[27]:
W Y
A 2 -8
C 2 -26

Conditional selection (boolean filtering)

In [28]:
df > 0  # element-wise comparison produces a boolean DataFrame
Out[28]:
W X Y Z
A True True False False
B True False True False
C True True False False
D True False True True
E True True False False
In [30]:
df[df > 0]  # cells where the condition is False become NaN (missing data)
Out[30]:
W X Y Z
A 2 79.0 NaN NaN
B 6 NaN 88.0 NaN
C 2 21.0 NaN NaN
D 16 NaN 3.0 51.0
E 30 49.0 NaN NaN
In [31]:
df[ df['X'] > 0 ]  # keep only rows where column X is positive
Out[31]:
W X Y Z
A 2 79 -8 -86
C 2 21 -26 -13
E 30 49 -48 -99
In [32]:
# Single .loc call instead of chained indexing (df[cond]['W']) — same result
# when reading, but avoids the chained-indexing pitfalls.
df.loc[df['X'] > 0, 'W']
Out[32]:
A     2
C     2
E    30
Name: W, dtype: int64
In [34]:
# Combine boolean conditions with & (and) / | (or);
# each condition needs its own parentheses.

df[ (df['W']>0) & (df['Y']>1) ]
Out[34]:
W X Y Z
B 6 -29 88 -80
D 16 -1 3 51
In [35]:
df.reset_index()  # moves the current index into a column; returns a new frame
Out[35]:
index W X Y Z
0 A 2 79 -8 -86
1 B 6 -29 88 -80
2 C 2 21 -26 -13
3 D 16 -1 3 51
4 E 30 49 -48 -99
In [36]:
# Attach the state codes as a regular column.
new_ind = ['CA', 'NY', 'WY', 'OR', 'CO']
df['States'] = new_ind
df
Out[36]:
W X Y Z States
A 2 79 -8 -86 CA
B 6 -29 88 -80 NY
C 2 21 -26 -13 WY
D 16 -1 3 51 OR
E 30 49 -48 -99 CO
In [38]:
# Promote the States column to be the row index.
df = df.set_index('States')
df
Out[38]:
W X Y Z
States
CA 2 79 -8 -86
NY 6 -29 88 -80
WY 2 21 -26 -13
OR 16 -1 3 51
CO 30 49 -48 -99
In [39]:
df.columns  # the column labels as an Index object
Out[39]:
Index(['W', 'X', 'Y', 'Z'], dtype='object')
In [40]:
df.describe()  # summary statistics (count, mean, std, quartiles) per column
Out[40]:
W X Y Z
count 5.00000 5.000000 5.000000 5.000000
mean 11.20000 23.800000 1.800000 -45.400000
std 11.96662 42.109381 51.915316 63.366395
min 2.00000 -29.000000 -48.000000 -99.000000
25% 2.00000 -1.000000 -26.000000 -86.000000
50% 6.00000 21.000000 -8.000000 -80.000000
75% 16.00000 49.000000 3.000000 -13.000000
max 30.00000 79.000000 88.000000 51.000000
In [41]:
df.info()  # dtypes, non-null counts and memory usage
<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, CA to CO
Data columns (total 4 columns):
W    5 non-null int64
X    5 non-null int64
Y    5 non-null int64
Z    5 non-null int64
dtypes: int64(4)
memory usage: 200.0+ bytes

Missing data

In [43]:
# Small frame with missing values to demonstrate NaN handling.
df = pd.DataFrame({
    'A': [1, 2, np.nan, 4],
    'B': [5, np.nan, np.nan, 8],
    'C': [10, 20, 30, 40],
})
df
Out[43]:
A B C
0 1.0 5.0 10
1 2.0 NaN 20
2 NaN NaN 30
3 4.0 8.0 40
In [45]:
df.dropna() # keep only the rows where nothing is missing
Out[45]:
A B C
0 1.0 5.0 10
3 4.0 8.0 40
In [46]:
df.dropna(axis=1) # keep only the columns where nothing is missing
Out[46]:
C
0 10
1 20
2 30
3 40
In [54]:
# thresh is the minimum number of NON-missing values a column must have
# to be kept — it is a count, not a percentage. Here columns need at least
# 3 non-NaN entries, so B (only 2 valid values) is dropped.

df.dropna(axis=1, thresh=3) 
Out[54]:
A C
0 1.0 10
1 2.0 20
2 NaN 30
3 4.0 40
In [55]:
# filling nan: replace every missing value with a constant
df.fillna(value=0)
Out[55]:
A B C
0 1.0 5.0 10
1 2.0 0.0 20
2 0.0 0.0 30
3 4.0 8.0 40
In [58]:
# fill missing values in a single column and assign the result back
df['A'] = df['A'].fillna(value=0)
In [59]:
df
Out[59]:
A B C
0 1.0 5.0 10
1 2.0 NaN 20
2 0.0 NaN 30
3 4.0 8.0 40
In [60]:
# impute missing B values with the column mean (mean of 5 and 8 is 6.5)
df['B'].fillna(df['B'].mean())
Out[60]:
0    5.0
1    6.5
2    6.5
3    8.0
Name: B, dtype: float64

GroupBy operations

In [63]:
# Load the universities dataset (CSV expected in the working directory).
df = pd.read_csv('Universities.csv')
df.head() # first few elements (first 5 rows by default)
Out[63]:
Sector University Year Completions Geography
0 Private for-profit, 2-year Pima Medical Institute-Las Vegas 2016 591 Nevada
1 Private for-profit, less-than 2-year Healthcare Preparatory Institute 2016 28 Nevada
2 Private for-profit, less-than 2-year Milan Institute-Las Vegas 2016 408 Nevada
3 Private for-profit, less-than 2-year Utah College of Massage Therapy-Vegas 2016 240 Nevada
4 Public, 4-year or above Western Nevada College 2016 960 Nevada
In [64]:
# numeric_only=True: since pandas 2.0, sum() no longer silently drops the
# string columns (Sector, University, Geography) and would raise instead.
df.groupby('Year').sum(numeric_only=True)
Out[64]:
Completions
Year
2012 20333
2013 21046
2014 24730
2015 26279
2016 26224
In [65]:
# Totals per year, sorted by year in descending order.
# numeric_only=True keeps modern pandas (>=2.0) from raising on string columns.
df.groupby('Year').sum(numeric_only=True).sort_index(ascending=False)
Out[65]:
Completions
Year
2016 26224
2015 26279
2014 24730
2013 21046
2012 20333
In [66]:
# Two-level grouping produces a MultiIndex of (Year, Sector).
# numeric_only=True keeps modern pandas (>=2.0) from raising on string columns.
df.groupby(['Year', 'Sector']).sum(numeric_only=True)
Out[66]:
Completions
Year Sector
2012 Private for-profit, 2-year 3072
Private for-profit, 4-year or above 632
Private for-profit, less-than 2-year 1327
Private not-for-profit, 2-year 665
Private not-for-profit, 4-year or above 1059
Public, 2-year 1170
Public, 4-year or above 12408
2013 Private for-profit, 2-year 3053
Private for-profit, 4-year or above 775
Private for-profit, less-than 2-year 1281
Private not-for-profit, 2-year 471
Private not-for-profit, 4-year or above 1016
Public, 2-year 1633
Public, 4-year or above 12817
2014 Private for-profit, 2-year 2957
Private for-profit, 4-year or above 1506
Private for-profit, less-than 2-year 1328
Private not-for-profit, 2-year 449
Private not-for-profit, 4-year or above 1042
Public, 2-year 2286
Public, 4-year or above 15162
2015 Private for-profit, 2-year 3280
Private for-profit, 4-year or above 1306
Private for-profit, less-than 2-year 1629
Private not-for-profit, 2-year 425
Private not-for-profit, 4-year or above 1228
Public, 2-year 2355
Public, 4-year or above 16056
2016 Private for-profit, 2-year 3286
Private for-profit, 4-year or above 748
Private for-profit, less-than 2-year 1552
Private not-for-profit, 2-year 322
Private not-for-profit, 4-year or above 1208
Public, 2-year 2431
Public, 4-year or above 16677
In [67]:
df.groupby('Year').describe()  # full summary statistics for each year
Out[67]:
Completions
count mean std min 25% 50% 75% max
Year
2012 38.0 535.078947 1036.433239 13.0 114.25 229.5 420.50 5388.0
2013 40.0 526.150000 1040.474782 0.0 98.50 189.0 413.00 5278.0
2014 42.0 588.809524 1150.355857 0.0 104.50 203.5 371.75 5093.0
2015 44.0 597.250000 1183.371791 0.0 87.75 191.0 405.75 5335.0
2016 43.0 609.860465 1235.952796 0.0 90.00 208.0 414.00 5367.0

Pandas operations

In [68]:
# Demo frame with a duplicated row (index 2 and 3) for the examples below.
df_one = pd.DataFrame(
    {
        'k1': ['A', 'A', 'B', 'B', 'C', 'C'],
        'col1': [100, 200, 300, 300, 400, 500],
        'col2': ['NY', 'CA', 'WA', 'WA', 'AK', 'NV'],
    }
)
df_one
Out[68]:
k1 col1 col2
0 A 100 NY
1 A 200 CA
2 B 300 WA
3 B 300 WA
4 C 400 AK
5 C 500 NV
In [69]:
# unique values, returned as a numpy array
df_one['col2'].unique()
Out[69]:
array(['NY', 'CA', 'WA', 'AK', 'NV'], dtype=object)
In [70]:
# number of unique values (count only, cheaper than len(unique()))
df_one['col2'].nunique()
Out[70]:
5
In [71]:
df_one['col2'].value_counts()  # frequency of each distinct value
Out[71]:
WA    2
CA    1
NY    1
NV    1
AK    1
Name: col2, dtype: int64
In [73]:
df_one.drop_duplicates()  # removes the repeated row (index 3 here)
Out[73]:
k1 col1 col2
0 A 100 NY
1 A 200 CA
2 B 300 WA
4 C 400 AK
5 C 500 NV
In [77]:
# Derived column: col1 scaled by 10.
df_one['NEW'] = df_one['col1'] * 10
df_one
Out[77]:
k1 col1 col2 NEW
0 A 100 NY 1000
1 A 200 CA 2000
2 B 300 WA 3000
3 B 300 WA 3000
4 C 400 AK 4000
5 C 500 NV 5000
In [78]:
def grab_first_letter(state):
    """Return the first character of the given string."""
    first_char = state[0]
    return first_char

grab_first_letter('NY')
Out[78]:
'N'
In [79]:
# apply() runs the function on every element of the Series
df_one['col2'].apply(grab_first_letter)
Out[79]:
0    N
1    C
2    W
3    W
4    A
5    N
Name: col2, dtype: object
In [81]:
# Store the per-element results as a new column.
df_one['first_letter'] = df_one['col2'].apply(grab_first_letter)
df_one
Out[81]:
k1 col1 col2 NEW first_letter
0 A 100 NY 1000 N
1 A 200 CA 2000 C
2 B 300 WA 3000 W
3 B 300 WA 3000 W
4 C 400 AK 4000 A
5 C 500 NV 5000 N
In [86]:
def complex_letter(state):
    """Return 'Washington' for states starting with 'W', otherwise 'Error'."""
    return 'Washington' if state[0] == 'W' else 'Error'
    
df_one['col2'].apply(complex_letter)  # apply supports arbitrary per-element logic
Out[86]:
0         Error
1         Error
2    Washington
3    Washington
4         Error
5         Error
Name: col2, dtype: object
In [88]:
# create new columns with mapping: map() translates each value via a dict
my_map = {'A':1, 'B':2, 'C':3}
df_one['k1'].map(my_map)
Out[88]:
0    1
1    1
2    2
3    2
4    3
5    3
Name: k1, dtype: int64
In [90]:
# Persist the mapped values as a new column.
df_one['new_map'] = df_one['k1'].map(my_map)
df_one
Out[90]:
k1 col1 col2 NEW first_letter new_map
0 A 100 NY 1000 N 1
1 A 200 CA 2000 C 1
2 B 300 WA 3000 W 2
3 B 300 WA 3000 W 2
4 C 400 AK 4000 A 3
5 C 500 NV 5000 N 3
In [92]:
# max values: max() gives the largest value, idxmax() its index label
print( df_one['col1'].max() )
print( df_one['col1'].idxmax() )
500
5
In [95]:
# Rename all columns at once (positional — the list length must match
# the number of columns exactly).
df_one.columns = ['c1', 'c2', 'c3', 'c4', 'c5', 'c6']
df_one
Out[95]:
c1 c2 c3 c4 c5 c6
0 A 100 NY 1000 N 1
1 A 200 CA 2000 C 1
2 B 300 WA 3000 W 2
3 B 300 WA 3000 W 2
4 C 400 AK 4000 A 3
5 C 500 NV 5000 N 3
In [96]:
# sorting: order rows by the c3 column (ascending by default)
df_one.sort_values('c3')
Out[96]:
c1 c2 c3 c4 c5 c6
4 C 400 AK 4000 A 3
1 A 200 CA 2000 C 1
5 C 500 NV 5000 N 3
0 A 100 NY 1000 N 1
2 B 300 WA 3000 W 2
3 B 300 WA 3000 W 2
In [97]:
# concatenating: feature columns and model predictions, built separately

features = pd.DataFrame(
    {'A': [100, 200, 300, 400, 500],
     'B': [12, 13, 14, 15, 16]}
)

predictions = pd.DataFrame({'pred': [0, 1, 1, 0, 1]})
In [99]:
pd.concat([features, predictions], axis=1)  # axis=1 glues the frames side by side
Out[99]:
A B pred
0 100 12 0
1 200 13 1
2 300 14 1
3 400 15 0
4 500 16 1
In [100]:
# one-hot encoding: one indicator column per distinct category in c1
pd.get_dummies(df_one['c1'])
Out[100]:
A B C
0 1 0 0
1 1 0 0
2 0 1 0
3 0 1 0
4 0 0 1
5 0 0 1