Pandas


In [1]:
import numpy as np
import pandas as pd

Series

In [2]:
# Build a Series from a NumPy array with explicit string index labels.
arr = np.array([10, 20, 30])
labels = ['a', 'b', 'c']  # fixed typo: was `labes`

pd.Series(arr, index=labels)
Out[2]:
a    10
b    20
c    30
dtype: int64
In [4]:
# Two sales Series that share some index labels (US, HUN, GB).
sales1 = pd.Series(data=[100, 200, 150, 250], index=['US', 'HUN', 'GB', 'GER'])
sales2 = pd.Series(data=[200, 500, 150, 250], index=['US', 'HUN', 'GB', 'FR'])
sales1
Out[4]:
US     100
HUN    200
GB     150
GER    250
dtype: int64
In [5]:
# Positional access via sales1[0] on a label-indexed Series is deprecated
# (FutureWarning in pandas 2.x, label-only in 3.0) — use .iloc for positions.
print( sales1.iloc[0] )
print( sales1['US'] )
100
100
In [7]:
sales1 + sales2  # NaN where a label appears in only one of the two series
Out[7]:
FR       NaN
GB     300.0
GER      NaN
HUN    700.0
US     300.0
dtype: float64

DataFrames

In [8]:
# Column and row labels for the demo DataFrame.
columns = list('WXYZ')
index = list('ABCDE')
In [10]:
from numpy.random import randint

np.random.seed(42)  # fix the seed so every run produces the same values
shape = (5, 4)      # 5 rows (A-E) by 4 columns (W-Z)
data = randint(-100, 100, shape)
data
Out[10]:
array([[  2,  79,  -8, -86],
       [  6, -29,  88, -80],
       [  2,  21, -26, -13],
       [ 16,  -1,   3,  51],
       [ 30,  49, -48, -99]])
In [12]:
# Assemble the frame with explicit keyword arguments for readability.
df = pd.DataFrame(data=data, index=index, columns=columns)
df
Out[12]:
W X Y Z
A 2 79 -8 -86
B 6 -29 88 -80
C 2 21 -26 -13
D 16 -1 3 51
E 30 49 -48 -99
In [13]:
df['W']  # selecting a single column returns a Series
Out[13]:
A     2
B     6
C     2
D    16
E    30
Name: W, dtype: int64
In [14]:
# Chained indexing (df['W']['A']) works for reads but is fragile and causes
# SettingWithCopyWarning on writes — prefer a single .loc lookup.
df.loc['A', 'W']
Out[14]:
2
In [15]:
df[['W', 'Z']]  # a list of column names returns a DataFrame
Out[15]:
W Z
A 2 -86
B 6 -80
C 2 -13
D 16 51
E 30 -99
In [16]:
# Add a derived column: element-wise sum of W and Y.
df['NEW'] = df['W'] + df['Y']
df
Out[16]:
W X Y Z NEW
A 2 79 -8 -86 -6
B 6 -29 88 -80 94
C 2 21 -26 -13 -24
D 16 -1 3 51 19
E 30 49 -48 -99 -18
In [20]:
# Remove the derived column again; drop() returns a new frame,
# which we assign back to df.
df = df.drop(columns='NEW')
df
Out[20]:
W X Y Z
A 2 79 -8 -86
B 6 -29 88 -80
C 2 21 -26 -13
D 16 -1 3 51
E 30 49 -48 -99
In [23]:
# to select a row by its label use loc; the result is a Series
df.loc['A']
Out[23]:
W     2
X    79
Y    -8
Z   -86
Name: A, dtype: int64
In [24]:
df.loc[['A', 'E']]  # a list of row labels returns a DataFrame
Out[24]:
W X Y Z
A 2 79 -8 -86
E 30 49 -48 -99
In [25]:
df.iloc[1:3]  # rows by integer position; the slice end is exclusive
Out[25]:
W X Y Z
B 6 -29 88 -80
C 2 21 -26 -13
In [27]:
df.loc[['A', 'C'], ['W', 'Y']]  # row and column subsets in a single lookup
Out[27]:
W Y
A 2 -8
C 2 -26

Conditional selection (boolean filtering)

In [28]:
df > 0  # element-wise comparison produces a boolean DataFrame
Out[28]:
W X Y Z
A True True False False
B True False True False
C True True False False
D True False True True
E True True False False
In [30]:
df[df > 0]  # cells where the condition is False become NaN (missing data)
Out[30]:
W X Y Z
A 2 79.0 NaN NaN
B 6 NaN 88.0 NaN
C 2 21.0 NaN NaN
D 16 NaN 3.0 51.0
E 30 49.0 NaN NaN
In [31]:
df[ df['X'] > 0 ]  # keep only rows where column X is positive
Out[31]:
W X Y Z
A 2 79 -8 -86
C 2 21 -26 -13
E 30 49 -48 -99
In [32]:
# Single .loc call instead of chained indexing (df[cond]['W']) — same result
# when reading, but avoids the chained-indexing pitfalls.
df.loc[df['X'] > 0, 'W']
Out[32]:
A     2
C     2
E    30
Name: W, dtype: int64
In [34]:
# Combine boolean conditions with & (and) / | (or);
# each condition needs its own parentheses.

df[ (df['W']>0) & (df['Y']>1) ]
Out[34]:
W X Y Z
B 6 -29 88 -80
D 16 -1 3 51
In [35]:
df.reset_index()  # moves the current index into a column; returns a new frame
Out[35]:
index W X Y Z
0 A 2 79 -8 -86
1 B 6 -29 88 -80
2 C 2 21 -26 -13
3 D 16 -1 3 51
4 E 30 49 -48 -99
In [36]:
# Attach the state codes as a regular column.
new_ind = ['CA', 'NY', 'WY', 'OR', 'CO']
df['States'] = new_ind
df
Out[36]:
W X Y Z States
A 2 79 -8 -86 CA
B 6 -29 88 -80 NY
C 2 21 -26 -13 WY
D 16 -1 3 51 OR
E 30 49 -48 -99 CO
In [38]:
# Promote the States column to be the row index.
df = df.set_index('States')
df
Out[38]:
W X Y Z
States
CA 2 79 -8 -86
NY 6 -29 88 -80
WY 2 21 -26 -13
OR 16 -1 3 51
CO 30 49 -48 -99
In [39]:
df.columns  # the column labels as an Index object
Out[39]:
Index(['W', 'X', 'Y', 'Z'], dtype='object')
In [40]:
df.describe()  # summary statistics (count, mean, std, quartiles) per column
Out[40]:
W X Y Z
count 5.00000 5.000000 5.000000 5.000000
mean 11.20000 23.800000 1.800000 -45.400000
std 11.96662 42.109381 51.915316 63.366395
min 2.00000 -29.000000 -48.000000 -99.000000
25% 2.00000 -1.000000 -26.000000 -86.000000
50% 6.00000 21.000000 -8.000000 -80.000000
75% 16.00000 49.000000 3.000000 -13.000000
max 30.00000 79.000000 88.000000 51.000000
In [41]:
df.info()  # dtypes, non-null counts and memory usage
<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, CA to CO
Data columns (total 4 columns):
W    5 non-null int64
X    5 non-null int64
Y    5 non-null int64
Z    5 non-null int64
dtypes: int64(4)
memory usage: 200.0+ bytes

Missing data

In [43]:
# Small frame with missing values to demonstrate NaN handling.
df = pd.DataFrame({
    'A': [1, 2, np.nan, 4],
    'B': [5, np.nan, np.nan, 8],
    'C': [10, 20, 30, 40],
})
df
Out[43]:
A B C
0 1.0 5.0 10
1 2.0 NaN 20
2 NaN NaN 30
3 4.0 8.0 40
In [45]:
df.dropna() # keep only the rows where nothing is missing
Out[45]:
A B C
0 1.0 5.0 10
3 4.0 8.0 40
In [46]:
df.dropna(axis=1) # keep only the columns where nothing is missing
Out[46]:
C
0 10
1 20
2 30
3 40
In [54]:
# thresh is the minimum number of NON-missing values a column must have
# to be kept — it is a count, not a percentage. Here columns need at least
# 3 non-NaN entries, so B (only 2 valid values) is dropped.

df.dropna(axis=1, thresh=3) 
Out[54]:
A C
0 1.0 10
1 2.0 20
2 NaN 30
3 4.0 40
In [55]:
# filling nan: replace every missing value with a constant
df.fillna(value=0)
Out[55]:
A B C
0 1.0 5.0 10
1 2.0 0.0 20
2 0.0 0.0 30
3 4.0 8.0 40
In [58]:
# fill missing values in a single column and assign the result back
df['A'] = df['A'].fillna(value=0)
In [59]:
df
Out[59]:
A B C
0 1.0 5.0 10
1 2.0 NaN 20
2 0.0 NaN 30
3 4.0 8.0 40
In [60]:
# impute missing B values with the column mean (mean of 5 and 8 is 6.5)
df['B'].fillna(df['B'].mean())
Out[60]:
0    5.0
1    6.5
2    6.5
3    8.0
Name: B, dtype: float64

GroupBy operations

In [63]:
# Load the universities dataset (CSV expected in the working directory).
df = pd.read_csv('Universities.csv')
df.head() # first few elements (first 5 rows by default)
Out[63]:
Sector University Year Completions Geography
0 Private for-profit, 2-year Pima Medical Institute-Las Vegas 2016 591 Nevada
1 Private for-profit, less-than 2-year Healthcare Preparatory Institute 2016 28 Nevada
2 Private for-profit, less-than 2-year Milan Institute-Las Vegas 2016 408 Nevada
3 Private for-profit, less-than 2-year Utah College of Massage Therapy-Vegas 2016 240 Nevada
4 Public, 4-year or above Western Nevada College 2016 960 Nevada
In [64]:
# numeric_only=True: since pandas 2.0, sum() no longer silently drops the
# string columns (Sector, University, Geography) and would raise instead.
df.groupby('Year').sum(numeric_only=True)
Out[64]:
Completions
Year
2012 20333
2013 21046
2014 24730
2015 26279
2016 26224
In [65]:
# Totals per year, sorted by year in descending order.
# numeric_only=True keeps modern pandas (>=2.0) from raising on string columns.
df.groupby('Year').sum(numeric_only=True).sort_index(ascending=False)
Out[65]:
Completions
Year
2016 26224
2015 26279
2014 24730
2013 21046
2012 20333
In [66]:
# Two-level grouping produces a MultiIndex of (Year, Sector).
# numeric_only=True keeps modern pandas (>=2.0) from raising on string columns.
df.groupby(['Year', 'Sector']).sum(numeric_only=True)
Out[66]:
Completions
Year Sector
2012 Private for-profit, 2-year 3072
Private for-profit, 4-year or above 632
Private for-profit, less-than 2-year 1327
Private not-for-profit, 2-year 665
Private not-for-profit, 4-year or above 1059
Public, 2-year 1170
Public, 4-year or above 12408
2013 Private for-profit, 2-year 3053
Private for-profit, 4-year or above 775
Private for-profit, less-than 2-year 1281
Private not-for-profit, 2-year 471
Private not-for-profit, 4-year or above 1016
Public, 2-year 1633
Public, 4-year or above 12817
2014 Private for-profit, 2-year 2957
Private for-profit, 4-year or above 1506
Private for-profit, less-than 2-year 1328
Private not-for-profit, 2-year 449
Private not-for-profit, 4-year or above 1042
Public, 2-year 2286
Public, 4-year or above 15162
2015 Private for-profit, 2-year 3280
Private for-profit, 4-year or above 1306
Private for-profit, less-than 2-year 1629
Private not-for-profit, 2-year 425
Private not-for-profit, 4-year or above 1228
Public, 2-year 2355
Public, 4-year or above 16056
2016 Private for-profit, 2-year 3286
Private for-profit, 4-year or above 748
Private for-profit, less-than 2-year 1552
Private not-for-profit, 2-year 322
Private not-for-profit, 4-year or above 1208
Public, 2-year 2431
Public, 4-year or above 16677
In [67]:
df.groupby('Year').describe()  # full summary statistics for each year
Out[67]:
Completions
count mean std min 25% 50% 75% max
Year
2012 38.0 535.078947 1036.433239 13.0 114.25 229.5 420.50 5388.0
2013 40.0 526.150000 1040.474782 0.0 98.50 189.0 413.00 5278.0
2014 42.0 588.809524 1150.355857 0.0 104.50 203.5 371.75 5093.0
2015 44.0 597.250000 1183.371791 0.0 87.75 191.0 405.75 5335.0
2016 43.0 609.860465 1235.952796 0.0 90.00 208.0 414.00 5367.0

Pandas operations

In [68]:
# Demo frame with a duplicated row (index 2 and 3) for the examples below.
df_one = pd.DataFrame(
    {
        'k1': ['A', 'A', 'B', 'B', 'C', 'C'],
        'col1': [100, 200, 300, 300, 400, 500],
        'col2': ['NY', 'CA', 'WA', 'WA', 'AK', 'NV'],
    }
)
df_one
Out[68]:
k1 col1 col2
0 A 100 NY
1 A 200 CA
2 B 300 WA
3 B 300 WA
4 C 400 AK
5 C 500 NV
In [69]:
# unique values, returned as a numpy array
df_one['col2'].unique()
Out[69]:
array(['NY', 'CA', 'WA', 'AK', 'NV'], dtype=object)
In [70]:
# number of unique values (count only, cheaper than len(unique()))
df_one['col2'].nunique()
Out[70]:
5
In [71]:
df_one['col2'].value_counts()  # frequency of each distinct value
Out[71]:
WA    2
CA    1
NY    1
NV    1
AK    1
Name: col2, dtype: int64
In [73]:
df_one.drop_duplicates()  # removes the repeated row (index 3 here)
Out[73]:
k1 col1 col2
0 A 100 NY
1 A 200 CA
2 B 300 WA
4 C 400 AK
5 C 500 NV
In [77]:
# Derived column: col1 scaled by 10.
df_one['NEW'] = df_one['col1'] * 10
df_one
Out[77]:
k1 col1 col2 NEW
0 A 100 NY 1000
1 A 200 CA 2000
2 B 300 WA 3000
3 B 300 WA 3000
4 C 400 AK 4000
5 C 500 NV 5000
In [78]:
def grab_first_letter(state):
    """Return the first character of the given string."""
    first_char = state[0]
    return first_char

grab_first_letter('NY')
Out[78]:
'N'
In [79]:
# apply() runs the function on every element of the Series
df_one['col2'].apply(grab_first_letter)
Out[79]:
0    N
1    C
2    W
3    W
4    A
5    N
Name: col2, dtype: object
In [81]:
# Store the per-element results as a new column.
df_one['first_letter'] = df_one['col2'].apply(grab_first_letter)
df_one
Out[81]:
k1 col1 col2 NEW first_letter
0 A 100 NY 1000 N
1 A 200 CA 2000 C
2 B 300 WA 3000 W
3 B 300 WA 3000 W
4 C 400 AK 4000 A
5 C 500 NV 5000 N
In [86]:
def complex_letter(state):
    """Return 'Washington' for states starting with 'W', otherwise 'Error'."""
    return 'Washington' if state[0] == 'W' else 'Error'
    
df_one['col2'].apply(complex_letter)  # apply supports arbitrary per-element logic
Out[86]:
0         Error
1         Error
2    Washington
3    Washington
4         Error
5         Error
Name: col2, dtype: object
In [88]:
# create new columns with mapping: map() translates each value via a dict
my_map = {'A':1, 'B':2, 'C':3}
df_one['k1'].map(my_map)
Out[88]:
0    1
1    1
2    2
3    2
4    3
5    3
Name: k1, dtype: int64
In [90]:
# Persist the mapped values as a new column.
df_one['new_map'] = df_one['k1'].map(my_map)
df_one
Out[90]:
k1 col1 col2 NEW first_letter new_map
0 A 100 NY 1000 N 1
1 A 200 CA 2000 C 1
2 B 300 WA 3000 W 2
3 B 300 WA 3000 W 2
4 C 400 AK 4000 A 3
5 C 500 NV 5000 N 3
In [92]:
# max values: max() gives the largest value, idxmax() its index label
print( df_one['col1'].max() )
print( df_one['col1'].idxmax() )
500
5
In [95]:
# Rename all columns at once (positional — the list length must match
# the number of columns exactly).
df_one.columns = ['c1', 'c2', 'c3', 'c4', 'c5', 'c6']
df_one
Out[95]:
c1 c2 c3 c4 c5 c6
0 A 100 NY 1000 N 1
1 A 200 CA 2000 C 1
2 B 300 WA 3000 W 2
3 B 300 WA 3000 W 2
4 C 400 AK 4000 A 3
5 C 500 NV 5000 N 3
In [96]:
# sorting: order rows by the c3 column (ascending by default)
df_one.sort_values('c3')
Out[96]:
c1 c2 c3 c4 c5 c6
4 C 400 AK 4000 A 3
1 A 200 CA 2000 C 1
5 C 500 NV 5000 N 3
0 A 100 NY 1000 N 1
2 B 300 WA 3000 W 2
3 B 300 WA 3000 W 2
In [97]:
# concatenating: feature columns and model predictions, built separately

features = pd.DataFrame(
    {'A': [100, 200, 300, 400, 500],
     'B': [12, 13, 14, 15, 16]}
)

predictions = pd.DataFrame({'pred': [0, 1, 1, 0, 1]})
In [99]:
pd.concat([features, predictions], axis=1)  # axis=1 glues the frames side by side
Out[99]:
A B pred
0 100 12 0
1 200 13 1
2 300 14 1
3 400 15 0
4 500 16 1
In [100]:
# one-hot encoding: one indicator column per distinct category in c1
pd.get_dummies(df_one['c1'])
Out[100]:
A B C
0 1 0 0
1 1 0 0
2 0 1 0
3 0 1 0
4 0 0 1
5 0 0 1