import pandas as pd
import numpy as np
from pandas import Series, DataFrame- helps in numerical computing (NumPy, SciPy)
- helps with analytical libraries (scikit-learn, and data visualizatioon,
- processes data without for loops
Data structures
- Series
- Data Frames
- index objects
Functionality
- Reindexing
- Dropping entreies from axis
- indexing, selection, and filtering
- DataFrame selection with loc and iloc
- integer indexing pitfalls
- pitfalls with chained indexing
- artihmetic and data alignment
- arithmetic methods with fill values
- Operations between DataFrame and Series
- Function applicaiton and mapping
- Sorting and Ranking
- Axis indexed with dupicate labels # Summarizing and Descriptive statistics
- correlation and variance
- unique values, counts, and memberships
Series
obj = pd.Series([4,2, 312, -3])
obj0 4
1 2
2 312
3 -3
dtype: int64
obj2 = pd.Series([4,2, 312, -3], index = ['a', 'b', 'c', 'd'])
obj2a 4
b 2
c 312
d -3
dtype: int64
obj2.indexIndex(['a', 'b', 'c', 'd'], dtype='object')
obj2[obj2 > 0]a 4
b 2
c 312
dtype: int64
np.exp(obj2)a 5.459815e+01
b 7.389056e+00
c 3.161392e+135
d 4.978707e-02
dtype: float64
'b' in obj2True
'e' in obj2False
sdata = {'ohio': 232, 'Texas': 332, 'Oregon': 34343}
obj3 = pd.Series(sdata)
obj3ohio 232
Texas 332
Oregon 34343
dtype: int64
obj3.to_dict(){'ohio': 232, 'Texas': 332, 'Oregon': 34343}
states = ['California', 'ohio', 'orgeon']
obj4 = pd.Series(sdata, index = states)
obj4California NaN
ohio 232.0
orgeon NaN
dtype: float64
pd.isna(obj4) # is nullCalifornia True
ohio False
orgeon True
dtype: bool
pd.notna(obj4) #not nullCalifornia False
ohio True
orgeon False
dtype: bool
obj3 + obj4California NaN
Oregon NaN
Texas NaN
ohio 464.0
orgeon NaN
dtype: float64
obj4.name = 'population'
obj4.index.name = 'state'
obj4state
California NaN
ohio 232.0
orgeon NaN
Name: population, dtype: float64
obj0 4
1 2
2 312
3 -3
dtype: int64
# altering the index in place
obj.index = ['Kunal', 'Rahul', 'Raghav', 'Ryan']
objKunal 4
Rahul 2
Raghav 312
Ryan -3
dtype: int64
DataFrame
data = {'state': ['ohio', 'ohio', 'nevada',],
'year': [2000, 2001, 2002,],
'pop': [1.2, 1.3, 1.4,]}
frame = pd.DataFrame(data)frame| state | year | pop | |
|---|---|---|---|
| 0 | ohio | 2000 | 1.2 |
| 1 | ohio | 2001 | 1.3 |
| 2 | nevada | 2002 | 1.4 |
frame.head()| state | year | pop | |
|---|---|---|---|
| 0 | ohio | 2000 | 1.2 |
| 1 | ohio | 2001 | 1.3 |
| 2 | nevada | 2002 | 1.4 |
frame.tail()| state | year | pop | |
|---|---|---|---|
| 0 | ohio | 2000 | 1.2 |
| 1 | ohio | 2001 | 1.3 |
| 2 | nevada | 2002 | 1.4 |
# passing another column in the dataframe
frame2 = pd.DataFrame(data, columns = ['state', 'year', 'pop', 'debt'])
frame2| state | year | pop | debt | |
|---|---|---|---|---|
| 0 | ohio | 2000 | 1.2 | NaN |
| 1 | ohio | 2001 | 1.3 | NaN |
| 2 | nevada | 2002 | 1.4 | NaN |
# changing the order of columns
frame2 = pd.DataFrame(data, columns = [ 'year', 'pop', 'debt', 'state'])frame2| state | year | pop | debt | |
|---|---|---|---|---|
| 0 | ohio | 2000 | 1.2 | NaN |
| 1 | ohio | 2001 | 1.3 | NaN |
| 2 | nevada | 2002 | 1.4 | NaN |
frame2.year0 2000
1 2001
2 2002
Name: year, dtype: int64
frame2.loc[1]year 2001
pop 1.3
debt NaN
state ohio
Name: 1, dtype: object
frame2.iloc[2]year 2002
pop 1.4
debt NaN
state nevada
Name: 2, dtype: object
frame2.pop<bound method DataFrame.pop of year pop debt state
0 2000 1.2 NaN ohio
1 2001 1.3 NaN ohio
2 2002 1.4 NaN nevada>
frame2.year0 2000
1 2001
2 2002
Name: year, dtype: int64
# assigning values
frame2['debt'] = 14.5
frame2| year | pop | debt | state | |
|---|---|---|---|---|
| 0 | 2000 | 1.2 | 14.5 | ohio |
| 1 | 2001 | 1.3 | 14.5 | ohio |
| 2 | 2002 | 1.4 | 14.5 | nevada |
# assiging a new column (resuls in new column if it does not exist before)
frame2['eastern'] = frame2['state'] =='ohio'
frame2| year | pop | debt | state | eastern | |
|---|---|---|---|---|---|
| 0 | 2000 | 1.2 | 14.5 | ohio | True |
| 1 | 2001 | 1.3 | 14.5 | ohio | True |
| 2 | 2002 | 1.4 | 14.5 | nevada | False |
# transposing
frame2.T| 0 | 1 | 2 | |
|---|---|---|---|
| year | 2000 | 2001 | 2002 |
| pop | 1.2 | 1.3 | 1.4 |
| debt | 14.5 | 14.5 | 14.5 |
| state | ohio | ohio | nevada |
| eastern | True | True | False |
pd.DataFrame(data)| state | year | pop | |
|---|---|---|---|
| 0 | ohio | 2000 | 1.2 |
| 1 | ohio | 2001 | 1.3 |
| 2 | nevada | 2002 | 1.4 |
frame2.index.name = 'year' frame2.columns.name = 'state' # starts with state columnframe2| state | year | pop | debt | state | eastern |
|---|---|---|---|---|---|
| year | |||||
| 0 | 2000 | 1.2 | 14.5 | ohio | True |
| 1 | 2001 | 1.3 | 14.5 | ohio | True |
| 2 | 2002 | 1.4 | 14.5 | nevada | False |
frame2.to_numpy()array([[2000, 1.2, 14.5, 'ohio', True],
[2001, 1.3, 14.5, 'ohio', True],
[2002, 1.4, 14.5, 'nevada', False]], dtype=object)
index objects
obj4 = pd.Series(np.arange(3), index = ['a', 'b', 'c'])
index = obj4.index
indexIndex(['a', 'b', 'c'], dtype='object')
index [1:]Index(['b', 'c'], dtype='object')
# index objects are immutable
index[1]= 'd' #type errorlabels = pd.Index(np.arange(3))
labelsIndex([0, 1, 2], dtype='int32')
obj2 = pd.Series([1.5, -2.5, 0], index = labels)
obj20 1.5
1 -2.5
2 0.0
dtype: float64
obj2.index is labelsTrue
frame2| state | year | pop | debt | state | eastern |
|---|---|---|---|---|---|
| year | |||||
| 0 | 2000 | 1.2 | 14.5 | ohio | True |
| 1 | 2001 | 1.3 | 14.5 | ohio | True |
| 2 | 2002 | 1.4 | 14.5 | nevada | False |
frame2.columnsIndex(['year', 'pop', 'debt', 'state', 'eastern'], dtype='object', name='state')
2003 in frame2.indexFalse
# unlike python, a pandas index can contain duplicate labels
pd.Index (['foo', 'boo', 'bar', 'baa', 'etc', 'foo'])
Index(['foo', 'boo', 'bar', 'baa', 'etc', 'foo'], dtype='object')
Reindexing
obj = pd.Series([4.5,48, -3,2,3.9], index= ['a', 'b', 'c', 'd', 'e'])
obja 4.5
b 48.0
c -3.0
d 2.0
e 3.9
dtype: float64
# reindexing
obj2 = obj.reindex(['b', 'a', 'c', 'd', 'e'])
obj2b 48.0
a 4.5
c -3.0
d 2.0
e 3.9
dtype: float64
# time series data fill
obj3 = pd.Series(['blue', 'purple', 'yellow'], index = [0, 2, 4])
obj30 blue
2 purple
4 yellow
dtype: object
# forward filling the values using ffill
obj3.reindex(np.arange(6), method='ffill')0 blue
1 blue
2 purple
3 purple
4 yellow
5 yellow
dtype: object
# backward fill
obj3.reindex(np.arange(6), method = 'bfill')0 blue
1 purple
2 purple
3 yellow
4 yellow
5 NaN
dtype: object
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
index = ['a', 'b', 'c'],
columns= ['ohio', 'texas', 'burmingham'])
frame| ohio | texas | burmingham | |
|---|---|---|---|
| a | 0 | 1 | 2 |
| b | 3 | 4 | 5 |
| c | 6 | 7 | 8 |
frame2 = frame.reindex(index=['a', 'b', 'c', 'd'])
frame2| ohio | texas | burmingham | |
|---|---|---|---|
| a | 0.0 | 1.0 | 2.0 |
| b | 3.0 | 4.0 | 5.0 |
| c | 6.0 | 7.0 | 8.0 |
| d | NaN | NaN | NaN |
# reindexing columns with column keyword
states = ['london', 'texus', 'surrey']
frame.reindex(columns = states)
| london | texus | surrey | |
|---|---|---|---|
| a | NaN | NaN | NaN |
| b | NaN | NaN | NaN |
| c | NaN | NaN | NaN |
Dropping entries from Axis
obj = pd.Series(np.arange(5.), index = ['a', 'b', 'c', 'd', 'e'])
obja 0.0
b 1.0
c 2.0
d 3.0
e 4.0
dtype: float64
new_obj = obj.drop('c')
new_obja 0.0
b 1.0
d 3.0
e 4.0
dtype: float64
obj.drop(['d', 'e'])a 0.0
b 1.0
c 2.0
dtype: float64
# in DataFrame
data = pd.DataFrame(np.arange(16).reshape((4,4)),
index=['québec', 'montréal', 'toronto', 'sainte-anne'],
columns = ['one', 'two', 'three', 'four'])
data| one | two | three | four | |
|---|---|---|---|---|
| québec | 0 | 1 | 2 | 3 |
| montréal | 4 | 5 | 6 | 7 |
| toronto | 8 | 9 | 10 | 11 |
| sainte-anne | 12 | 13 | 14 | 15 |
# using drop method
data.drop(index=['toronto', 'sainte-anne'])| one | two | three | four | |
|---|---|---|---|---|
| québec | 0 | 1 | 2 | 3 |
| montréal | 4 | 5 | 6 | 7 |
# dropping using axis method (axis = 1 = columns)
data.drop('two', axis=1)| one | three | four | |
|---|---|---|---|
| québec | 0 | 2 | 3 |
| montréal | 4 | 6 | 7 |
| toronto | 8 | 10 | 11 |
| sainte-anne | 12 | 14 | 15 |
data.drop(['three', 'four'], axis='columns')| one | two | |
|---|---|---|
| québec | 0 | 1 |
| montréal | 4 | 5 |
| toronto | 8 | 9 |
| sainte-anne | 12 | 13 |
Indexing, Selecting, and Filtering
obj = pd.Series(np.arange(4.), index= ['a', 'b', 'c', 'd'])
obja 0.0
b 1.0
c 2.0
d 3.0
dtype: float64
obj['b']1.0
obj[1]1.0
obj[2:4]c 2.0
d 3.0
dtype: float64
obj[obj<2]a 0.0
b 1.0
dtype: float64
obj.loc[['b', 'c']]b 1.0
c 2.0
dtype: float64
obj1 = pd.Series([1,2,3], index = [2,0,1])
obj2 = pd.Series([1,2,3], index = ['a', 'b', 'c'])
obj12 1
0 2
1 3
dtype: int64
obj2a 1
b 2
c 3
dtype: int64
# loc fails as index doesnot contain integers
obj2.loc[[0, 1]]# fix this
obj2.loc['b':'c']b 2
c 3
dtype: int64
# so, prefer using iloc with integers
obj1.iloc[[0,1,2]]2 1
0 2
1 3
dtype: int64
obj2.iloc[[0,1,2]]a 1
b 2
c 3
dtype: int64
# assigning values
obj2.loc['b':'c'] = 5
obj2a 1
b 5
c 5
dtype: int64
data| one | two | three | four | |
|---|---|---|---|---|
| québec | 0 | 1 | 2 | 3 |
| montréal | 4 | 5 | 6 | 7 |
| toronto | 8 | 9 | 10 | 11 |
| sainte-anne | 12 | 13 | 14 | 15 |
data[:2]| one | two | three | four | |
|---|---|---|---|---|
| québec | 0 | 1 | 2 | 3 |
| montréal | 4 | 5 | 6 | 7 |
# booleans
data < 5| one | two | three | four | |
|---|---|---|---|---|
| québec | True | True | True | True |
| montréal | True | False | False | False |
| toronto | False | False | False | False |
| sainte-anne | False | False | False | False |
# assigning values
data[data < 5] = 0
data| one | two | three | four | |
|---|---|---|---|---|
| québec | 0 | 0 | 0 | 0 |
| montréal | 0 | 5 | 6 | 7 |
| toronto | 8 | 9 | 10 | 11 |
| sainte-anne | 12 | 13 | 14 | 15 |
selection of DataFrame with loc and iloc
data| one | two | three | four | |
|---|---|---|---|---|
| québec | 0 | 0 | 0 | 0 |
| montréal | 0 | 5 | 6 | 7 |
| toronto | 8 | 9 | 10 | 11 |
| sainte-anne | 12 | 13 | 14 | 15 |
data.loc['montréal']one 0
two 5
three 6
four 7
Name: montréal, dtype: int32
data.loc[['montréal', 'québec']]| one | two | three | four | |
|---|---|---|---|---|
| montréal | 0 | 5 | 6 | 7 |
| québec | 0 | 0 | 0 | 0 |
data.loc['montréal', ['two', 'three']]two 5
three 6
Name: montréal, dtype: int32
# similar operations with iloc
data.iloc[2]one 8
two 9
three 10
four 11
Name: toronto, dtype: int32
data.iloc[[2,1]] #third row and second row| one | two | three | four | |
|---|---|---|---|---|
| toronto | 8 | 9 | 10 | 11 |
| montréal | 0 | 5 | 6 | 7 |
data.iloc[2,[3,0,1]] #third row (three elements in order)four 11
one 8
two 9
Name: toronto, dtype: int32
data.iloc[[1,2],[3,0,1]]| four | one | two | |
|---|---|---|---|
| montréal | 7 | 0 | 5 |
| toronto | 11 | 8 | 9 |
integer indexing pitfalls
series = pd.Series(np.arange(3.))series0 0.0
1 1.0
2 2.0
dtype: float64
# fails here but works fine with iloc and loc
series[-1]
# value error; key error: -1series.iloc[-1]2.0
# non-integer doesnot do this ambiguity
series2 = pd.Series(np.arange(3.0), index = ['a', 'b', 'c'])
series2[-1]2.0
Pitfalls with chained indexing
data.loc[:, 'one'] = 1
data| one | two | three | four | |
|---|---|---|---|---|
| québec | 1 | 0 | 0 | 0 |
| montréal | 1 | 5 | 6 | 7 |
| toronto | 1 | 9 | 10 | 11 |
| sainte-anne | 1 | 13 | 14 | 15 |
data.iloc[2] = 5data| one | two | three | four | |
|---|---|---|---|---|
| québec | 1 | 0 | 0 | 0 |
| montréal | 1 | 5 | 6 | 7 |
| toronto | 5 | 5 | 5 | 5 |
| sainte-anne | 1 | 13 | 14 | 15 |
data.loc[data['four'] > 5] = 3
data| one | two | three | four | |
|---|---|---|---|---|
| québec | 1 | 0 | 0 | 0 |
| montréal | 3 | 3 | 3 | 3 |
| toronto | 5 | 5 | 5 | 5 |
| sainte-anne | 3 | 3 | 3 | 3 |
# the data gets modified, but it is not the way that was asked for
# fixing it with loc operation
data.loc[data.three == 10, "three"] = 9
data| one | two | three | four | |
|---|---|---|---|---|
| québec | 1 | 0 | 0 | 0 |
| montréal | 3 | 3 | 3 | 3 |
| toronto | 5 | 5 | 5 | 5 |
| sainte-anne | 3 | 3 | 3 | 3 |
Arithmetic and Data Alignment
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index = ['a', 'c', 'd', 'e'])
s2 = pd.Series([1.2, -3, -.3, -.33, -43.2], index = ['e', 'j', 'o', 't', 'y'])
s1a 7.3
c -2.5
d 3.4
e 1.5
dtype: float64
s2e 1.20
j -3.00
o -0.30
t -0.33
y -43.20
dtype: float64
# adding these- missing values donot overlap
s1+s2a NaN
c NaN
d NaN
e 2.7
j NaN
o NaN
t NaN
y NaN
dtype: float64
# in case of DataFrame, alignment is performed on both rows and columns
df1 = pd.DataFrame(np.arange(9.).reshape((3,3)),
columns = list('abc'),
index = ['ferozpur', 'faridkot', 'montréal'])
df2 = pd.DataFrame(np.arange(12.).reshape((4,3)),
columns = list('abc'),
index = ['faridkot', 'toronto', 'québec', 'montréal'])df1| a | b | c | |
|---|---|---|---|
| ferozpur | 0.0 | 1.0 | 2.0 |
| faridkot | 3.0 | 4.0 | 5.0 |
| montréal | 6.0 | 7.0 | 8.0 |
df2| a | b | c | |
|---|---|---|---|
| faridkot | 0.0 | 1.0 | 2.0 |
| toronto | 3.0 | 4.0 | 5.0 |
| québec | 6.0 | 7.0 | 8.0 |
| montréal | 9.0 | 10.0 | 11.0 |
df1 + df2 #because the columns were same, it added those numbers| a | b | c | |
|---|---|---|---|
| faridkot | 3.0 | 5.0 | 7.0 |
| ferozpur | NaN | NaN | NaN |
| montréal | 15.0 | 17.0 | 19.0 |
| québec | NaN | NaN | NaN |
| toronto | NaN | NaN | NaN |
# changing columns names will give all NAN (null values)
df3 = pd.DataFrame(np.arange(12.).reshape((4,3)),
columns = list('xyz'),
index = ['faridkot', 'toronto', 'québec', 'montréal'])df1 + df3| a | b | c | x | y | z | |
|---|---|---|---|---|---|---|
| faridkot | NaN | NaN | NaN | NaN | NaN | NaN |
| ferozpur | NaN | NaN | NaN | NaN | NaN | NaN |
| montréal | NaN | NaN | NaN | NaN | NaN | NaN |
| québec | NaN | NaN | NaN | NaN | NaN | NaN |
| toronto | NaN | NaN | NaN | NaN | NaN | NaN |
Arithmetic methods with fill values
df2| a | b | c | |
|---|---|---|---|
| faridkot | 0.0 | 1.0 | 2.0 |
| toronto | 3.0 | 4.0 | 5.0 |
| québec | 6.0 | 7.0 | 8.0 |
| montréal | 9.0 | 10.0 | 11.0 |
df2.loc['faridkot', 'y'] = np.nandf2| a | b | c | y | |
|---|---|---|---|---|
| faridkot | 0.0 | 1.0 | 2.0 | NaN |
| toronto | 3.0 | 4.0 | 5.0 | NaN |
| québec | 6.0 | 7.0 | 8.0 | NaN |
| montréal | 9.0 | 10.0 | 11.0 | NaN |
help(pd.DataFrame._drop_axis)Help on function _drop_axis in module pandas.core.generic:
_drop_axis(self: 'NDFrameT', labels, axis, level=None, errors: 'IgnoreRaise' = 'raise', only_slice: 'bool_t' = False) -> 'NDFrameT'
Drop labels from specified axis. Used in the ``drop`` method
internally.
Parameters
----------
labels : single label or list-like
axis : int or axis name
level : int or level name, default None
For MultiIndex
errors : {'ignore', 'raise'}, default 'raise'
If 'ignore', suppress error and existing labels are dropped.
only_slice : bool, default False
Whether indexing along columns should be view-only.
help(pd.DataFrame.drop)print(dir(DataFrame))['T', '_AXIS_LEN', '_AXIS_ORDERS', '_AXIS_TO_AXIS_NUMBER', '_HANDLED_TYPES', '__abs__', '__add__', '__and__', '__annotations__', '__array__', '__array_priority__', '__array_ufunc__', '__bool__', '__class__', '__contains__', '__copy__', '__dataframe__', '__deepcopy__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__divmod__', '__doc__', '__eq__', '__finalize__', '__floordiv__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__iadd__', '__iand__', '__ifloordiv__', '__imod__', '__imul__', '__init__', '__init_subclass__', '__invert__', '__ior__', '__ipow__', '__isub__', '__iter__', '__itruediv__', '__ixor__', '__le__', '__len__', '__lt__', '__matmul__', '__mod__', '__module__', '__mul__', '__ne__', '__neg__', '__new__', '__nonzero__', '__or__', '__pos__', '__pow__', '__radd__', '__rand__', '__rdivmod__', '__reduce__', '__reduce_ex__', '__repr__', '__rfloordiv__', '__rmatmul__', '__rmod__', '__rmul__', '__ror__', '__round__', '__rpow__', '__rsub__', '__rtruediv__', '__rxor__', '__setattr__', '__setitem__', '__setstate__', '__sizeof__', '__str__', '__sub__', '__subclasshook__', '__truediv__', '__weakref__', '__xor__', '_accessors', '_accum_func', '_add_numeric_operations', '_agg_examples_doc', '_agg_summary_and_see_also_doc', '_align_frame', '_align_series', '_append', '_arith_method', '_as_manager', '_box_col_values', '_can_fast_transpose', '_check_inplace_and_allows_duplicate_labels', '_check_inplace_setting', '_check_is_chained_assignment_possible', '_check_label_or_level_ambiguity', '_check_setitem_copy', '_clear_item_cache', '_clip_with_one_bound', '_clip_with_scalar', '_cmp_method', '_combine_frame', '_consolidate', '_consolidate_inplace', '_construct_axes_dict', '_construct_result', '_constructor', '_constructor_sliced', '_create_data_for_split_and_tight_to_dict', '_data', '_dir_additions', '_dir_deletions', '_dispatch_frame_op', '_drop_axis', '_drop_labels_or_levels', '_ensure_valid_index', '_find_valid_index', '_from_arrays', '_get_agg_axis', '_get_axis', '_get_axis_name', '_get_axis_number', '_get_axis_resolvers', '_get_block_manager_axis', '_get_bool_data', '_get_cleaned_column_resolvers', '_get_column_array', '_get_index_resolvers', '_get_item_cache', '_get_label_or_level_values', '_get_numeric_data', '_get_value', '_getitem_bool_array', '_getitem_multilevel', '_getitem_nocopy', '_gotitem', '_hidden_attrs', '_indexed_same', '_info_axis', '_info_axis_name', '_info_axis_number', '_info_repr', '_init_mgr', '_inplace_method', '_internal_names', '_internal_names_set', '_is_copy', '_is_homogeneous_type', '_is_label_or_level_reference', '_is_label_reference', '_is_level_reference', '_is_mixed_type', '_is_view', '_iset_item', '_iset_item_mgr', '_iset_not_inplace', '_iter_column_arrays', '_ixs', '_join_compat', '_logical_func', '_logical_method', '_maybe_cache_changed', '_maybe_update_cacher', '_metadata', '_min_count_stat_function', '_needs_reindex_multi', '_protect_consolidate', '_reduce', '_reduce_axis1', '_reindex_axes', '_reindex_columns', '_reindex_index', '_reindex_multi', '_reindex_with_indexers', '_rename', '_replace_columnwise', '_repr_data_resource_', '_repr_fits_horizontal_', '_repr_fits_vertical_', '_repr_html_', '_repr_latex_', '_reset_cache', '_reset_cacher', '_sanitize_column', '_series', '_set_axis', '_set_axis_name', '_set_axis_nocheck', '_set_is_copy', '_set_item', '_set_item_frame_value', '_set_item_mgr', '_set_value', '_setitem_array', '_setitem_frame', '_setitem_slice', '_slice', '_stat_axis', '_stat_axis_name', '_stat_axis_number', '_stat_function', '_stat_function_ddof', '_take', '_take_with_is_copy', '_to_dict_of_blocks', '_to_latex_via_styler', '_typ', '_update_inplace', '_validate_dtype', '_values', '_where', 'abs', 'add', 'add_prefix', 'add_suffix', 'agg', 'aggregate', 'align', 'all', 'any', 'apply', 'applymap', 'asfreq', 'asof', 'assign', 'astype', 'at', 'at_time', 'attrs', 'axes', 'backfill', 'between_time', 'bfill', 'bool', 'boxplot', 'clip', 'columns', 'combine', 'combine_first', 'compare', 'convert_dtypes', 'copy', 'corr', 'corrwith', 'count', 'cov', 'cummax', 'cummin', 'cumprod', 'cumsum', 'describe', 'diff', 'div', 'divide', 'dot', 'drop', 'drop_duplicates', 'droplevel', 'dropna', 'dtypes', 'duplicated', 'empty', 'eq', 'equals', 'eval', 'ewm', 'expanding', 'explode', 'ffill', 'fillna', 'filter', 'first', 'first_valid_index', 'flags', 'floordiv', 'from_dict', 'from_records', 'ge', 'get', 'groupby', 'gt', 'head', 'hist', 'iat', 'idxmax', 'idxmin', 'iloc', 'index', 'infer_objects', 'info', 'insert', 'interpolate', 'isetitem', 'isin', 'isna', 'isnull', 'items', 'iterrows', 'itertuples', 'join', 'keys', 'kurt', 'kurtosis', 'last', 'last_valid_index', 'le', 'loc', 'lt', 'mask', 'max', 'mean', 'median', 'melt', 'memory_usage', 'merge', 'min', 'mod', 'mode', 'mul', 'multiply', 'ndim', 'ne', 'nlargest', 'notna', 'notnull', 'nsmallest', 'nunique', 'pad', 'pct_change', 'pipe', 'pivot', 'pivot_table', 'plot', 'pop', 'pow', 'prod', 'product', 'quantile', 'query', 'radd', 'rank', 'rdiv', 'reindex', 'reindex_like', 'rename', 'rename_axis', 'reorder_levels', 'replace', 'resample', 'reset_index', 'rfloordiv', 'rmod', 'rmul', 'rolling', 'round', 'rpow', 'rsub', 'rtruediv', 'sample', 'select_dtypes', 'sem', 'set_axis', 'set_flags', 'set_index', 'shape', 'shift', 'size', 'skew', 'sort_index', 'sort_values', 'sparse', 'squeeze', 'stack', 'std', 'style', 'sub', 'subtract', 'sum', 'swapaxes', 'swaplevel', 'tail', 'take', 'to_clipboard', 'to_csv', 'to_dict', 'to_excel', 'to_feather', 'to_gbq', 'to_hdf', 'to_html', 'to_json', 'to_latex', 'to_markdown', 'to_numpy', 'to_orc', 'to_parquet', 'to_period', 'to_pickle', 'to_records', 'to_sql', 'to_stata', 'to_string', 'to_timestamp', 'to_xarray', 'to_xml', 'transform', 'transpose', 'truediv', 'truncate', 'tz_convert', 'tz_localize', 'unstack', 'update', 'value_counts', 'values', 'var', 'where', 'xs']
help(pd.DataFrame.describe)help(pd.DataFrame._drop_axis)df4 = df2df4| a | b | c | y | |
|---|---|---|---|---|
| faridkot | 0.0 | 1.0 | 2.0 | NaN |
| toronto | 3.0 | 4.0 | 5.0 | NaN |
| québec | 6.0 | 7.0 | 8.0 | NaN |
| montréal | 9.0 | 10.0 | 11.0 | NaN |
df1 + df4| a | b | c | y | |
|---|---|---|---|---|
| faridkot | 3.0 | 5.0 | 7.0 | NaN |
| ferozpur | NaN | NaN | NaN | NaN |
| montréal | 15.0 | 17.0 | 19.0 | NaN |
| québec | NaN | NaN | NaN | NaN |
| toronto | NaN | NaN | NaN | NaN |
df4.fill_value = 0
df4| a | b | c | y | |
|---|---|---|---|---|
| faridkot | 0.0 | 1.0 | 2.0 | NaN |
| toronto | 3.0 | 4.0 | 5.0 | NaN |
| québec | 6.0 | 7.0 | 8.0 | NaN |
| montréal | 9.0 | 10.0 | 11.0 | NaN |
1/df4| a | b | c | y | |
|---|---|---|---|---|
| faridkot | inf | 1.000000 | 0.500000 | NaN |
| toronto | 0.333333 | 0.250000 | 0.200000 | NaN |
| québec | 0.166667 | 0.142857 | 0.125000 | NaN |
| montréal | 0.111111 | 0.100000 | 0.090909 | NaN |
df4.rdiv(1)| a | b | c | y | |
|---|---|---|---|---|
| faridkot | inf | 1.000000 | 0.500000 | NaN |
| toronto | 0.333333 | 0.250000 | 0.200000 | NaN |
| québec | 0.166667 | 0.142857 | 0.125000 | NaN |
| montréal | 0.111111 | 0.100000 | 0.090909 | NaN |
df4.reindex(columns = df4.columns, fill_value=0) # not working| a | b | c | y | |
|---|---|---|---|---|
| faridkot | 0.0 | 1.0 | 2.0 | NaN |
| toronto | 3.0 | 4.0 | 5.0 | NaN |
| québec | 6.0 | 7.0 | 8.0 | NaN |
| montréal | 9.0 | 10.0 | 11.0 | NaN |
Operations between DataFrame and Series
arr = np.arange(12.).reshape((3,4))
arrarray([[ 0., 1., 2., 3.],
[ 4., 5., 6., 7.],
[ 8., 9., 10., 11.]])
arr[0]array([0., 1., 2., 3.])
# broadcasting
arr - arr[0] #subtracts from all rows array([[0., 0., 0., 0.],
[4., 4., 4., 4.],
[8., 8., 8., 8.]])
frame| ohio | texas | burmingham | |
|---|---|---|---|
| a | 0 | 1 | 2 |
| b | 3 | 4 | 5 |
| c | 6 | 7 | 8 |
help(pd.Series)series
series1 = pd.Series(data = np.arange(3), index = ['a', 'b', 'c'])
series1a 0
b 1
c 2
dtype: int32
frame-series1| a | b | burmingham | c | ohio | texas | |
|---|---|---|---|---|---|---|
| a | NaN | NaN | NaN | NaN | NaN | NaN |
| b | NaN | NaN | NaN | NaN | NaN | NaN |
| c | NaN | NaN | NaN | NaN | NaN | NaN |
frame + series2| a | b | burmingham | c | ohio | texas | |
|---|---|---|---|---|---|---|
| a | NaN | NaN | NaN | NaN | NaN | NaN |
| b | NaN | NaN | NaN | NaN | NaN | NaN |
| c | NaN | NaN | NaN | NaN | NaN | NaN |
Function application and ‘mapping’
frame2 = pd.DataFrame(np.random.standard_normal((4,3)),
columns = list('bde'),
index = ['utah', 'faridkot', 'shahkot', 'malsahian'])
frame2| b | d | e | |
|---|---|---|---|
| utah | -1.549165 | 0.443756 | 1.013167 |
| faridkot | 1.130587 | -1.289388 | -1.210530 |
| shahkot | 1.195553 | 0.274397 | 0.510043 |
| malsahian | 0.713024 | -1.223282 | 1.857681 |
np.abs(frame2) #converts non-negative values to positive| b | d | e | |
|---|---|---|---|
| utah | 1.549165 | 0.443756 | 1.013167 |
| faridkot | 1.130587 | 1.289388 | 1.210530 |
| shahkot | 1.195553 | 0.274397 | 0.510043 |
| malsahian | 0.713024 | 1.223282 | 1.857681 |
help(np.abs)Help on ufunc:
absolute = <ufunc 'absolute'>
absolute(x, /, out=None, *, where=True, casting='same_kind', order='K', dtype=None, subok=True[, signature, extobj])
Calculate the absolute value element-wise.
``np.abs`` is a shorthand for this function.
Parameters
----------
x : array_like
Input array.
out : ndarray, None, or tuple of ndarray and None, optional
A location into which the result is stored. If provided, it must have
a shape that the inputs broadcast to. If not provided or None,
a freshly-allocated array is returned. A tuple (possible only as a
keyword argument) must have length equal to the number of outputs.
where : array_like, optional
This condition is broadcast over the input. At locations where the
condition is True, the `out` array will be set to the ufunc result.
Elsewhere, the `out` array will retain its original value.
Note that if an uninitialized `out` array is created via the default
``out=None``, locations within it where the condition is False will
remain uninitialized.
**kwargs
For other keyword-only arguments, see the
:ref:`ufunc docs <ufuncs.kwargs>`.
Returns
-------
absolute : ndarray
An ndarray containing the absolute value of
each element in `x`. For complex input, ``a + ib``, the
absolute value is :math:`\sqrt{ a^2 + b^2 }`.
This is a scalar if `x` is a scalar.
Examples
--------
>>> x = np.array([-1.2, 1.2])
>>> np.absolute(x)
array([ 1.2, 1.2])
>>> np.absolute(1.2 + 1j)
1.5620499351813308
Plot the function over ``[-10, 10]``:
>>> import matplotlib.pyplot as plt
>>> x = np.linspace(start=-10, stop=10, num=101)
>>> plt.plot(x, np.absolute(x))
>>> plt.show()
Plot the function over the complex plane:
>>> xx = x + 1j * x[:, np.newaxis]
>>> plt.imshow(np.abs(xx), extent=[-10, 10, -10, 10], cmap='gray')
>>> plt.show()
The `abs` function can be used as a shorthand for ``np.absolute`` on
ndarrays.
>>> x = np.array([-1.2, 1.2])
>>> abs(x)
array([1.2, 1.2])
#DataFrame's apply method
def f1(x):
return x.max() - x.min()
frame2.apply(f1)b 2.744718
d 1.733145
e 3.068212
dtype: float64
# applying across columns
frame2.apply(f1, axis = 'columns')utah 2.562332
faridkot 2.419976
shahkot 0.921156
malsahian 3.080963
dtype: float64
# modifying the function to return Series with multiple values
def f2(x):
return pd.Series([x.min(), x.max()], index= ['min', 'max'])
frame.apply(f2)| ohio | texas | burmingham | |
|---|---|---|---|
| min | 0 | 1 | 2 |
| max | 6 | 7 | 8 |
frame| ohio | texas | burmingham | |
|---|---|---|---|
| a | 0 | 1 | 2 |
| b | 3 | 4 | 5 |
| c | 6 | 7 | 8 |
frame2.apply(f2)| b | d | e | |
|---|---|---|---|
| min | -1.549165 | -1.289388 | -1.210530 |
| max | 1.195553 | 0.443756 | 1.857681 |
# apply map function
def my_format(x):
return f"{x:.2f}"
frame2.applymap(my_format)| b | d | e | |
|---|---|---|---|
| utah | -1.55 | 0.44 | 1.01 |
| faridkot | 1.13 | -1.29 | -1.21 |
| shahkot | 1.20 | 0.27 | 0.51 |
| malsahian | 0.71 | -1.22 | 1.86 |
# applying map function in Series
frame2['e'].map(my_format)utah 1.01
faridkot -1.21
shahkot 0.51
malsahian 1.86
Name: e, dtype: object
Sorting and Ranking
obj2 = pd.Series(np.arange(4), index = ['d', 'a', 'b', 'c'])
obj2d 0
a 1
b 2
c 3
dtype: int32
obj2.sort_index()a 1
b 2
c 3
d 0
dtype: int32
# sorting in DataFrame can be done with either axis
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
index = ['three', 'one'],
columns = ['d', 'a', 'b', 'c'])
frame
| d | a | b | c | |
|---|---|---|---|---|
| three | 0 | 1 | 2 | 3 |
| one | 4 | 5 | 6 | 7 |
frame.sort_index()| d | a | b | c | |
|---|---|---|---|---|
| one | 4 | 5 | 6 | 7 |
| three | 0 | 1 | 2 | 3 |
frame.sort_index(axis= 'columns')| a | b | c | d | |
|---|---|---|---|---|
| three | 1 | 2 | 3 | 0 |
| one | 5 | 6 | 7 | 4 |
# data can be stored in descending order aswell
frame.sort_index(axis = 'columns', ascending = False)| d | c | b | a | |
|---|---|---|---|---|
| three | 0 | 3 | 2 | 1 |
| one | 4 | 7 | 6 | 5 |
# sorting a series by its values
obj = pd.Series([4, 7, -3, -2])
obj.sort_values()2 -3
3 -2
0 4
1 7
dtype: int64
# missing values get sorted to the end by default
obj = pd.Series([4, 3, 4, np.nan, 33, np.nan, -3, 3])
obj.sort_values()6 -3.0
1 3.0
7 3.0
0 4.0
2 4.0
4 33.0
3 NaN
5 NaN
dtype: float64
# using na_position to bring missing values to the front
obj.sort_values(na_position= 'first')3 NaN
5 NaN
6 -3.0
1 3.0
7 3.0
0 4.0
2 4.0
4 33.0
dtype: float64
# while sorting a DataFrame
frame = pd.DataFrame({'b': [1, 2, 3, 4, 5], 'a':[3, 43, 33, 1, 5]})
frame| b | a | |
|---|---|---|
| 0 | 1 | 3 |
| 1 | 2 | 43 |
| 2 | 3 | 33 |
| 3 | 4 | 1 |
| 4 | 5 | 5 |
frame.sort_values('a')| b | a | |
|---|---|---|
| 3 | 4 | 1 |
| 0 | 1 | 3 |
| 4 | 5 | 5 |
| 2 | 3 | 33 |
| 1 | 2 | 43 |
# ranking
obj = pd.Series([4, 5, -5, 7, 8, 0, 4])
obj.rank()0 3.5
1 5.0
2 1.0
3 6.0
4 7.0
5 2.0
6 3.5
dtype: float64
# ranking in order the data is observed
obj.rank(method='first')0 3.0
1 5.0
2 1.0
3 6.0
4 7.0
5 2.0
6 4.0
dtype: float64
obj.rank(ascending = False)0 4.5
1 3.0
2 7.0
3 2.0
4 1.0
5 6.0
6 4.5
dtype: float64
# DataFrame for rank computation
frame| b | a | |
|---|---|---|
| 0 | 1 | 3 |
| 1 | 2 | 43 |
| 2 | 3 | 33 |
| 3 | 4 | 1 |
| 4 | 5 | 5 |
frame.rank(axis = 'columns')| b | a | |
|---|---|---|
| 0 | 1.0 | 2.0 |
| 1 | 1.0 | 2.0 |
| 2 | 1.0 | 2.0 |
| 3 | 2.0 | 1.0 |
| 4 | 1.5 | 1.5 |
Axis indices with duplicate labels
obj = pd.Series(np.arange(5), index=['a', 'a', 'b','b', 'c' ])
obja 0
a 1
b 2
b 3
c 4
dtype: int32
obj.index.is_uniqueFalse
obj['a']a 0
a 1
dtype: int32
obj['c']4
# DataFrame
df = pd.DataFrame(np.random.standard_normal((5, 3)),
index = ['a', 'a', 'b', 'c', 'b'])
df| 0 | 1 | 2 | |
|---|---|---|---|
| a | -1.817751 | 0.915854 | -0.389590 |
| a | 0.603020 | 0.573012 | 1.070691 |
| b | -0.903033 | 1.109707 | 0.874381 |
| c | 2.529357 | -1.169854 | 0.676702 |
| b | -0.368763 | 0.723758 | 0.375079 |
df.loc['b']| 0 | 1 | 2 | |
|---|---|---|---|
| b | -0.903033 | 1.109707 | 0.874381 |
| b | -0.368763 | 0.723758 | 0.375079 |
df.loc['c']0 2.529357
1 -1.169854
2 0.676702
Name: c, dtype: float64
Descriptive statistics
df.sum()0 0.042830
1 2.152477
2 2.607263
dtype: float64
df.sum(axis = 'columns')a -1.291487
a 2.246723
b 1.081055
c 2.036205
b 0.730074
dtype: float64
df.describe()| 0 | 1 | 2 | |
|---|---|---|---|
| count | 5.000000 | 5.000000 | 5.000000 |
| mean | 0.008566 | 0.430495 | 0.521453 |
| std | 1.659562 | 0.917106 | 0.570471 |
| min | -1.817751 | -1.169854 | -0.389590 |
| 25% | -0.903033 | 0.573012 | 0.375079 |
| 50% | -0.368763 | 0.723758 | 0.676702 |
| 75% | 0.603020 | 0.915854 | 0.874381 |
| max | 2.529357 | 1.109707 | 1.070691 |
df.cumsum()| 0 | 1 | 2 | |
|---|---|---|---|
| a | -1.817751 | 0.915854 | -0.389590 |
| a | -1.214731 | 1.488866 | 0.681101 |
| b | -2.117763 | 2.598573 | 1.555481 |
| c | 0.411594 | 1.428719 | 2.232183 |
| b | 0.042830 | 2.152477 | 2.607263 |
Unique Values, Value counts, and Membership
obj = pd.Series(['c', 'd', 'a', 'b', 'n', 'm', 'g', "k", 'b', 'c', 'k'])
uniques = obj.unique()
uniquesarray(['c', 'd', 'a', 'b', 'n', 'm', 'g', 'k'], dtype=object)
obj.value_counts()c 2
b 2
k 2
d 1
a 1
n 1
m 1
g 1
Name: count, dtype: int64
pd.value_counts(obj.to_numpy(), sort = False)c 2
d 1
a 1
b 2
n 1
m 1
g 1
k 2
Name: count, dtype: int64
# 'isin' is used for vectorized set memebership
obj0 c
1 d
2 a
3 b
4 n
5 m
6 g
7 k
8 b
9 c
10 k
dtype: object
mask = obj.isin(['b', 'c'])
mask0 True
1 False
2 False
3 True
4 False
5 False
6 False
7 False
8 True
9 True
10 False
dtype: bool
obj[mask]0 c
3 b
8 b
9 c
dtype: object
to_match = pd.Series(['c', 'a', 'b', 'b', 'c'])
unique_vals = pd.Series(['c', 'b', 'a'])
indices = pd.Index(unique_vals).get_indexer(to_match)
indicesarray([0, 2, 1, 1, 0], dtype=int64)
data = pd.DataFrame({'ça va': [1, 2, 3, 4, 5],
'oui, ça va' : [43,3, 2, 4, 2],
'et toi': [3, 2, 44, 1, 5]})
data| ça va | oui, ça va | et toi | |
|---|---|---|---|
| 0 | 1 | 43 | 3 |
| 1 | 2 | 3 | 2 |
| 2 | 3 | 2 | 44 |
| 3 | 4 | 4 | 1 |
| 4 | 5 | 2 | 5 |
# computing value counts
data['ça va'].value_counts().sort_index()ça va
1 1
2 1
3 1
4 1
5 1
Name: count, dtype: int64
data['et toi'].value_counts().sort_index()et toi
1 1
2 1
3 1
5 1
44 1
Name: count, dtype: int64
result = data.apply(pd.value_counts).fillna(0)
result| ça va | oui, ça va | et toi | |
|---|---|---|---|
| 1 | 1.0 | 0.0 | 1.0 |
| 2 | 1.0 | 2.0 | 1.0 |
| 3 | 1.0 | 1.0 | 1.0 |
| 4 | 1.0 | 1.0 | 0.0 |
| 5 | 1.0 | 0.0 | 1.0 |
| 43 | 0.0 | 1.0 | 0.0 |
| 44 | 0.0 | 0.0 | 1.0 |