import numpy as np
import pandas as pd
Series
arr = np.array([10, 20, 30])
labels = ['a', 'b', 'c']
pd.Series(arr, labels)
sales1 = pd.Series([100, 200, 150, 250], ['US', 'HUN', 'GB', 'GER'])
sales2 = pd.Series([200, 500, 150, 250], ['US', 'HUN', 'GB', 'FR'])
sales1
print(sales1.iloc[0])   # by integer position (plain sales1[0] is deprecated on a labelled index)
print(sales1['US'])     # by label
sales1 + sales2  # NaN where a label appears in only one of the two series
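A quick extra sketch: Series.add with fill_value treats the labels missing on one side as 0 instead of producing NaN.
sales1.add(sales2, fill_value=0)  # 'GER' and 'FR' each count as 0 on the missing side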
Data frames
columns = ['W', 'X', 'Y', 'Z']
index = ['A', 'B', 'C', 'D', 'E']
from numpy.random import randint
np.random.seed(42)  # seed so the same random numbers come out on every run
data = randint(-100, 100, (5,4))
data
df = pd.DataFrame(data, index, columns)
df
df['W']
df['W']['A']  # chained indexing; df.loc['A', 'W'] is the safer way to get one cell
df[['W', 'Z']]
df['NEW'] = df['W'] + df['Y']
df
df = df.drop('NEW', axis=1)
df
# to select a row, use loc (by label) or iloc (by position)
df.loc['A']
df.loc[['A', 'E']]
df.iloc[1:3]
df.loc[['A', 'C'], ['W', 'Y']]
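A quick sketch: loc also picks single cells and label slices directly, which avoids chained indexing like df['W']['A'].
df.loc['A', 'W']          # single cell by row and column label
df.loc['A':'C', 'W':'Y']  # label slices include both endpoints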
Conditional selection
df > 0
df[df > 0]  # entries where the condition is False become NaN (missing data)
df[ df['X'] > 0 ]
df[ df['X'] > 0 ]['W']
# & means and, | means or (use these to combine conditions, not Python's and/or)
df[ (df['W']>0) & (df['Y']>1) ]
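A quick sketch of the other operators: | for or and ~ for negation.
df[ (df['W']>0) | (df['Y']>1) ]  # rows where either condition holds
df[ ~(df['X']>0) ]               # rows where the condition is False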
df.reset_index()
new_ind = ['CA', 'NY', 'WY', 'OR', 'CO']
df['States'] = new_ind
df
df = df.set_index('States')
df
df.columns
df.describe()
df.info()
Missing data
df = pd.DataFrame({'A': [1, 2, np.nan, 4],
                   'B': [5, np.nan, np.nan, 8],
                   'C': [10, 20, 30, 40]})
df
df.dropna()        # drop rows that contain any NaN
df.dropna(axis=1)  # drop columns that contain any NaN
# thresh is the minimum number of non-NaN values a column must have to be kept
# here: keep columns with at least 3 non-NaN values
df.dropna(axis=1, thresh=3)
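The same idea works row-wise; a quick sketch on this df.
df.dropna(thresh=2)  # keep rows with at least 2 non-NaN values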
# filling nan
df.fillna(value=0)
df['A'] = df['A'].fillna(value=0)
df
df['B'].fillna(df['B'].mean())  # fill with the column mean (result not assigned back)
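A related sketch: filling every column at once with its own mean (all columns here are numeric, so df.mean() is well defined).
df.fillna(df.mean())  # each NaN replaced by the mean of its column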
GroupBy operations
df = pd.read_csv('Universities.csv')
df.head()  # first five rows by default
df.groupby('Year').sum()
df.groupby('Year').sum().sort_index(ascending=False)  # years in descending order
df.groupby(['Year', 'Sector']).sum()
df.groupby('Year').describe()
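A couple of further groupby sketches, using only the Year and Sector columns already referenced above.
df.groupby('Year')['Sector'].nunique()  # distinct sectors per year
df.groupby(['Year', 'Sector']).size()   # row count per (year, sector) group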
Pandas operations
df_one = pd.DataFrame({'k1': ['A', 'A', 'B', 'B', 'C', 'C'],
                       'col1': [100, 200, 300, 300, 400, 500],
                       'col2': ['NY', 'CA', 'WA', 'WA', 'AK', 'NV']})
df_one
# unique values
df_one['col2'].unique()
# number of unique values
df_one['col2'].nunique()
df_one['col2'].value_counts()
df_one.drop_duplicates()
df_one['NEW'] = df_one['col1'] * 10
df_one
def grab_first_letter(state):
    return state[0]
grab_first_letter('NY')
df_one['col2'].apply(grab_first_letter)
df_one['first_letter'] = df_one['col2'].apply(grab_first_letter)
df_one
def complex_letter(state):
    if state[0] == 'W':
        return 'Washington'
    else:
        return 'Error'
df_one['col2'].apply(complex_letter)
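A quick sketch: for one-off transformations, a lambda passed to apply does the same job without a named function.
df_one['col2'].apply(lambda state: state[:2])  # first two letters of each state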
# create new columns with mapping
my_map = {'A':1, 'B':2, 'C':3}
df_one['k1'].map(my_map)
df_one['new_map'] = df_one['k1'].map(my_map)
df_one
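One caveat worth a sketch (with a hypothetical smaller mapping): keys missing from the dict map to NaN.
partial_map = {'A': 1, 'B': 2}  # hypothetical mapping that leaves 'C' out
df_one['k1'].map(partial_map)   # rows with 'C' become NaN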
# max values
print( df_one['col1'].max() )
print( df_one['col1'].idxmax() )
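A sketch combining the two: idxmax returns the index label of the maximum, and loc pulls back the whole row.
df_one.loc[ df_one['col1'].idxmax() ]  # the row holding the maximum of col1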
df_one.columns = ['c1', 'c2', 'c3', 'c4', 'c5', 'c6']
df_one
# sorting
df_one.sort_values('c3')
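Two more sorting sketches: descending order, and sorting by several columns at once.
df_one.sort_values('c2', ascending=False)
df_one.sort_values(['c1', 'c2'], ascending=[True, False])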
# concatenating
features = pd.DataFrame({'A': [100, 200, 300, 400, 500],
                         'B': [12, 13, 14, 15, 16]})
predictions = pd.DataFrame({'pred':[0, 1, 1, 0, 1]})
pd.concat([features, predictions], axis=1)
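A sketch of the other direction: with axis=0, concat stacks rows instead of columns.
pd.concat([features, features], axis=0, ignore_index=True)  # stack rows and rebuild the index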
# one-hot encoding
pd.get_dummies(df_one['c1'])
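A small variant sketch: drop_first removes one redundant dummy column.
pd.get_dummies(df_one['c1'], drop_first=True)  # A is implied when B and C are both 0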