import pandas as pd
import numpy as np
from pandas import Series, DataFrame
- helps in numerical computing (NumPy, SciPy)
- helps with analytical libraries (scikit-learn, and data visualizatioon,
- processes data without for loops
Data structures
- Series
- Data Frames
- index objects
Functionality
- Reindexing
- Dropping entreies from axis
- indexing, selection, and filtering
- DataFrame selection with loc and iloc
- integer indexing pitfalls
- pitfalls with chained indexing
- artihmetic and data alignment
- arithmetic methods with fill values
- Operations between DataFrame and Series
- Function applicaiton and mapping
- Sorting and Ranking
- Axis indexed with dupicate labels # Summarizing and Descriptive statistics
- correlation and variance
- unique values, counts, and memberships
Series
= pd.Series([4,2, 312, -3])
obj
obj
0 4
1 2
2 312
3 -3
dtype: int64
= pd.Series([4,2, 312, -3], index = ['a', 'b', 'c', 'd'])
obj2
obj2
a 4
b 2
c 312
d -3
dtype: int64
obj2.index
Index(['a', 'b', 'c', 'd'], dtype='object')
> 0] obj2[obj2
a 4
b 2
c 312
dtype: int64
np.exp(obj2)
a 5.459815e+01
b 7.389056e+00
c 3.161392e+135
d 4.978707e-02
dtype: float64
'b' in obj2
True
'e' in obj2
False
= {'ohio': 232, 'Texas': 332, 'Oregon': 34343}
sdata
= pd.Series(sdata)
obj3
obj3
ohio 232
Texas 332
Oregon 34343
dtype: int64
obj3.to_dict()
{'ohio': 232, 'Texas': 332, 'Oregon': 34343}
= ['California', 'ohio', 'orgeon']
states
= pd.Series(sdata, index = states)
obj4
obj4
California NaN
ohio 232.0
orgeon NaN
dtype: float64
# is null pd.isna(obj4)
California True
ohio False
orgeon True
dtype: bool
#not null pd.notna(obj4)
California False
ohio True
orgeon False
dtype: bool
+ obj4 obj3
California NaN
Oregon NaN
Texas NaN
ohio 464.0
orgeon NaN
dtype: float64
= 'population'
obj4.name
= 'state'
obj4.index.name
obj4
state
California NaN
ohio 232.0
orgeon NaN
Name: population, dtype: float64
obj
0 4
1 2
2 312
3 -3
dtype: int64
# altering the index in place
= ['Kunal', 'Rahul', 'Raghav', 'Ryan']
obj.index
obj
Kunal 4
Rahul 2
Raghav 312
Ryan -3
dtype: int64
DataFrame
= {'state': ['ohio', 'ohio', 'nevada',],
data 'year': [2000, 2001, 2002,],
'pop': [1.2, 1.3, 1.4,]}
= pd.DataFrame(data) frame
frame
state | year | pop | |
---|---|---|---|
0 | ohio | 2000 | 1.2 |
1 | ohio | 2001 | 1.3 |
2 | nevada | 2002 | 1.4 |
frame.head()
state | year | pop | |
---|---|---|---|
0 | ohio | 2000 | 1.2 |
1 | ohio | 2001 | 1.3 |
2 | nevada | 2002 | 1.4 |
frame.tail()
state | year | pop | |
---|---|---|---|
0 | ohio | 2000 | 1.2 |
1 | ohio | 2001 | 1.3 |
2 | nevada | 2002 | 1.4 |
# passing another column in the dataframe
= pd.DataFrame(data, columns = ['state', 'year', 'pop', 'debt'])
frame2
frame2
state | year | pop | debt | |
---|---|---|---|---|
0 | ohio | 2000 | 1.2 | NaN |
1 | ohio | 2001 | 1.3 | NaN |
2 | nevada | 2002 | 1.4 | NaN |
# changing the order of columns
= pd.DataFrame(data, columns = [ 'year', 'pop', 'debt', 'state']) frame2
frame2
state | year | pop | debt | |
---|---|---|---|---|
0 | ohio | 2000 | 1.2 | NaN |
1 | ohio | 2001 | 1.3 | NaN |
2 | nevada | 2002 | 1.4 | NaN |
frame2.year
0 2000
1 2001
2 2002
Name: year, dtype: int64
1] frame2.loc[
year 2001
pop 1.3
debt NaN
state ohio
Name: 1, dtype: object
2] frame2.iloc[
year 2002
pop 1.4
debt NaN
state nevada
Name: 2, dtype: object
frame2.pop
<bound method DataFrame.pop of year pop debt state
0 2000 1.2 NaN ohio
1 2001 1.3 NaN ohio
2 2002 1.4 NaN nevada>
frame2.year
0 2000
1 2001
2 2002
Name: year, dtype: int64
# assigning values
'debt'] = 14.5
frame2[
frame2
year | pop | debt | state | |
---|---|---|---|---|
0 | 2000 | 1.2 | 14.5 | ohio |
1 | 2001 | 1.3 | 14.5 | ohio |
2 | 2002 | 1.4 | 14.5 | nevada |
# assiging a new column (resuls in new column if it does not exist before)
'eastern'] = frame2['state'] =='ohio'
frame2[
frame2
year | pop | debt | state | eastern | |
---|---|---|---|---|---|
0 | 2000 | 1.2 | 14.5 | ohio | True |
1 | 2001 | 1.3 | 14.5 | ohio | True |
2 | 2002 | 1.4 | 14.5 | nevada | False |
# transposing
frame2.T
0 | 1 | 2 | |
---|---|---|---|
year | 2000 | 2001 | 2002 |
pop | 1.2 | 1.3 | 1.4 |
debt | 14.5 | 14.5 | 14.5 |
state | ohio | ohio | nevada |
eastern | True | True | False |
pd.DataFrame(data)
state | year | pop | |
---|---|---|---|
0 | ohio | 2000 | 1.2 |
1 | ohio | 2001 | 1.3 |
2 | nevada | 2002 | 1.4 |
= 'year' frame2.index.name
= 'state' # starts with state column frame2.columns.name
frame2
state | year | pop | debt | state | eastern |
---|---|---|---|---|---|
year | |||||
0 | 2000 | 1.2 | 14.5 | ohio | True |
1 | 2001 | 1.3 | 14.5 | ohio | True |
2 | 2002 | 1.4 | 14.5 | nevada | False |
frame2.to_numpy()
array([[2000, 1.2, 14.5, 'ohio', True],
[2001, 1.3, 14.5, 'ohio', True],
[2002, 1.4, 14.5, 'nevada', False]], dtype=object)
index objects
= pd.Series(np.arange(3), index = ['a', 'b', 'c'])
obj4
= obj4.index
index
index
Index(['a', 'b', 'c'], dtype='object')
1:] index [
Index(['b', 'c'], dtype='object')
# index objects are immutable
1]= 'd' #type error index[
= pd.Index(np.arange(3))
labels
labels
Index([0, 1, 2], dtype='int32')
= pd.Series([1.5, -2.5, 0], index = labels)
obj2
obj2
0 1.5
1 -2.5
2 0.0
dtype: float64
is labels obj2.index
True
frame2
state | year | pop | debt | state | eastern |
---|---|---|---|---|---|
year | |||||
0 | 2000 | 1.2 | 14.5 | ohio | True |
1 | 2001 | 1.3 | 14.5 | ohio | True |
2 | 2002 | 1.4 | 14.5 | nevada | False |
frame2.columns
Index(['year', 'pop', 'debt', 'state', 'eastern'], dtype='object', name='state')
2003 in frame2.index
False
# unlike python, a pandas index can contain duplicate labels
'foo', 'boo', 'bar', 'baa', 'etc', 'foo'])
pd.Index ([
Index(['foo', 'boo', 'bar', 'baa', 'etc', 'foo'], dtype='object')
Reindexing
= pd.Series([4.5,48, -3,2,3.9], index= ['a', 'b', 'c', 'd', 'e'])
obj
obj
a 4.5
b 48.0
c -3.0
d 2.0
e 3.9
dtype: float64
# reindexing
= obj.reindex(['b', 'a', 'c', 'd', 'e'])
obj2
obj2
b 48.0
a 4.5
c -3.0
d 2.0
e 3.9
dtype: float64
# time series data fill
= pd.Series(['blue', 'purple', 'yellow'], index = [0, 2, 4])
obj3
obj3
0 blue
2 purple
4 yellow
dtype: object
# forward filling the values using ffill
6), method='ffill') obj3.reindex(np.arange(
0 blue
1 blue
2 purple
3 purple
4 yellow
5 yellow
dtype: object
# backward fill
6), method = 'bfill') obj3.reindex(np.arange(
0 blue
1 purple
2 purple
3 yellow
4 yellow
5 NaN
dtype: object
= pd.DataFrame(np.arange(9).reshape((3, 3)),
frame = ['a', 'b', 'c'],
index = ['ohio', 'texas', 'burmingham'])
columns
frame
ohio | texas | burmingham | |
---|---|---|---|
a | 0 | 1 | 2 |
b | 3 | 4 | 5 |
c | 6 | 7 | 8 |
= frame.reindex(index=['a', 'b', 'c', 'd'])
frame2
frame2
ohio | texas | burmingham | |
---|---|---|---|
a | 0.0 | 1.0 | 2.0 |
b | 3.0 | 4.0 | 5.0 |
c | 6.0 | 7.0 | 8.0 |
d | NaN | NaN | NaN |
# reindexing columns with column keyword
= ['london', 'texus', 'surrey']
states
= states)
frame.reindex(columns
london | texus | surrey | |
---|---|---|---|
a | NaN | NaN | NaN |
b | NaN | NaN | NaN |
c | NaN | NaN | NaN |
Dropping entries from Axis
= pd.Series(np.arange(5.), index = ['a', 'b', 'c', 'd', 'e'])
obj
obj
a 0.0
b 1.0
c 2.0
d 3.0
e 4.0
dtype: float64
= obj.drop('c')
new_obj new_obj
a 0.0
b 1.0
d 3.0
e 4.0
dtype: float64
'd', 'e']) obj.drop([
a 0.0
b 1.0
c 2.0
dtype: float64
# in DataFrame
= pd.DataFrame(np.arange(16).reshape((4,4)),
data =['québec', 'montréal', 'toronto', 'sainte-anne'],
index= ['one', 'two', 'three', 'four'])
columns data
one | two | three | four | |
---|---|---|---|---|
québec | 0 | 1 | 2 | 3 |
montréal | 4 | 5 | 6 | 7 |
toronto | 8 | 9 | 10 | 11 |
sainte-anne | 12 | 13 | 14 | 15 |
# using drop method
=['toronto', 'sainte-anne']) data.drop(index
one | two | three | four | |
---|---|---|---|---|
québec | 0 | 1 | 2 | 3 |
montréal | 4 | 5 | 6 | 7 |
# dropping using axis method (axis = 1 = columns)
'two', axis=1) data.drop(
one | three | four | |
---|---|---|---|
québec | 0 | 2 | 3 |
montréal | 4 | 6 | 7 |
toronto | 8 | 10 | 11 |
sainte-anne | 12 | 14 | 15 |
'three', 'four'], axis='columns') data.drop([
one | two | |
---|---|---|
québec | 0 | 1 |
montréal | 4 | 5 |
toronto | 8 | 9 |
sainte-anne | 12 | 13 |
Indexing, Selecting, and Filtering
= pd.Series(np.arange(4.), index= ['a', 'b', 'c', 'd'])
obj
obj
a 0.0
b 1.0
c 2.0
d 3.0
dtype: float64
'b'] obj[
1.0
1] obj[
1.0
2:4] obj[
c 2.0
d 3.0
dtype: float64
<2] obj[obj
a 0.0
b 1.0
dtype: float64
'b', 'c']] obj.loc[[
b 1.0
c 2.0
dtype: float64
= pd.Series([1,2,3], index = [2,0,1])
obj1
= pd.Series([1,2,3], index = ['a', 'b', 'c'])
obj2
obj1
2 1
0 2
1 3
dtype: int64
obj2
a 1
b 2
c 3
dtype: int64
# loc fails as index doesnot contain integers
0, 1]] obj2.loc[[
# fix this
'b':'c'] obj2.loc[
b 2
c 3
dtype: int64
# so, prefer using iloc with integers
0,1,2]] obj1.iloc[[
2 1
0 2
1 3
dtype: int64
0,1,2]] obj2.iloc[[
a 1
b 2
c 3
dtype: int64
# assigning values
'b':'c'] = 5
obj2.loc[
obj2
a 1
b 5
c 5
dtype: int64
data
one | two | three | four | |
---|---|---|---|---|
québec | 0 | 1 | 2 | 3 |
montréal | 4 | 5 | 6 | 7 |
toronto | 8 | 9 | 10 | 11 |
sainte-anne | 12 | 13 | 14 | 15 |
2] data[:
one | two | three | four | |
---|---|---|---|---|
québec | 0 | 1 | 2 | 3 |
montréal | 4 | 5 | 6 | 7 |
# booleans
< 5 data
one | two | three | four | |
---|---|---|---|---|
québec | True | True | True | True |
montréal | True | False | False | False |
toronto | False | False | False | False |
sainte-anne | False | False | False | False |
# assigning values
< 5] = 0
data[data
data
one | two | three | four | |
---|---|---|---|---|
québec | 0 | 0 | 0 | 0 |
montréal | 0 | 5 | 6 | 7 |
toronto | 8 | 9 | 10 | 11 |
sainte-anne | 12 | 13 | 14 | 15 |
selection of DataFrame with loc and iloc
data
one | two | three | four | |
---|---|---|---|---|
québec | 0 | 0 | 0 | 0 |
montréal | 0 | 5 | 6 | 7 |
toronto | 8 | 9 | 10 | 11 |
sainte-anne | 12 | 13 | 14 | 15 |
'montréal'] data.loc[
one 0
two 5
three 6
four 7
Name: montréal, dtype: int32
'montréal', 'québec']] data.loc[[
one | two | three | four | |
---|---|---|---|---|
montréal | 0 | 5 | 6 | 7 |
québec | 0 | 0 | 0 | 0 |
'montréal', ['two', 'three']] data.loc[
two 5
three 6
Name: montréal, dtype: int32
# similar operations with iloc
2] data.iloc[
one 8
two 9
three 10
four 11
Name: toronto, dtype: int32
2,1]] #third row and second row data.iloc[[
one | two | three | four | |
---|---|---|---|---|
toronto | 8 | 9 | 10 | 11 |
montréal | 0 | 5 | 6 | 7 |
2,[3,0,1]] #third row (three elements in order) data.iloc[
four 11
one 8
two 9
Name: toronto, dtype: int32
1,2],[3,0,1]] data.iloc[[
four | one | two | |
---|---|---|---|
montréal | 7 | 0 | 5 |
toronto | 11 | 8 | 9 |
integer indexing pitfalls
= pd.Series(np.arange(3.)) series
series
0 0.0
1 1.0
2 2.0
dtype: float64
# fails here but works fine with iloc and loc
-1]
series[
# value error; key error: -1
-1] series.iloc[
2.0
# non-integer doesnot do this ambiguity
= pd.Series(np.arange(3.0), index = ['a', 'b', 'c'])
series2
-1] series2[
2.0
Pitfalls with chained indexing
'one'] = 1
data.loc[:,
data
one | two | three | four | |
---|---|---|---|---|
québec | 1 | 0 | 0 | 0 |
montréal | 1 | 5 | 6 | 7 |
toronto | 1 | 9 | 10 | 11 |
sainte-anne | 1 | 13 | 14 | 15 |
2] = 5 data.iloc[
data
one | two | three | four | |
---|---|---|---|---|
québec | 1 | 0 | 0 | 0 |
montréal | 1 | 5 | 6 | 7 |
toronto | 5 | 5 | 5 | 5 |
sainte-anne | 1 | 13 | 14 | 15 |
'four'] > 5] = 3
data.loc[data[
data
one | two | three | four | |
---|---|---|---|---|
québec | 1 | 0 | 0 | 0 |
montréal | 3 | 3 | 3 | 3 |
toronto | 5 | 5 | 5 | 5 |
sainte-anne | 3 | 3 | 3 | 3 |
# the data gets modified, but it is not the way that was asked for
# fixing it with loc operation
== 10, "three"] = 9
data.loc[data.three
data
one | two | three | four | |
---|---|---|---|---|
québec | 1 | 0 | 0 | 0 |
montréal | 3 | 3 | 3 | 3 |
toronto | 5 | 5 | 5 | 5 |
sainte-anne | 3 | 3 | 3 | 3 |
Arithmetic and Data Alignment
= pd.Series([7.3, -2.5, 3.4, 1.5], index = ['a', 'c', 'd', 'e'])
s1
= pd.Series([1.2, -3, -.3, -.33, -43.2], index = ['e', 'j', 'o', 't', 'y'])
s2
s1
a 7.3
c -2.5
d 3.4
e 1.5
dtype: float64
s2
e 1.20
j -3.00
o -0.30
t -0.33
y -43.20
dtype: float64
# adding these- missing values donot overlap
+s2 s1
a NaN
c NaN
d NaN
e 2.7
j NaN
o NaN
t NaN
y NaN
dtype: float64
# in case of DataFrame, alignment is performed on both rows and columns
= pd.DataFrame(np.arange(9.).reshape((3,3)),
df1 = list('abc'),
columns = ['ferozpur', 'faridkot', 'montréal'])
index
= pd.DataFrame(np.arange(12.).reshape((4,3)),
df2 = list('abc'),
columns = ['faridkot', 'toronto', 'québec', 'montréal']) index
df1
a | b | c | |
---|---|---|---|
ferozpur | 0.0 | 1.0 | 2.0 |
faridkot | 3.0 | 4.0 | 5.0 |
montréal | 6.0 | 7.0 | 8.0 |
df2
a | b | c | |
---|---|---|---|
faridkot | 0.0 | 1.0 | 2.0 |
toronto | 3.0 | 4.0 | 5.0 |
québec | 6.0 | 7.0 | 8.0 |
montréal | 9.0 | 10.0 | 11.0 |
+ df2 #because the columns were same, it added those numbers df1
a | b | c | |
---|---|---|---|
faridkot | 3.0 | 5.0 | 7.0 |
ferozpur | NaN | NaN | NaN |
montréal | 15.0 | 17.0 | 19.0 |
québec | NaN | NaN | NaN |
toronto | NaN | NaN | NaN |
# changing columns names will give all NAN (null values)
= pd.DataFrame(np.arange(12.).reshape((4,3)),
df3 = list('xyz'),
columns = ['faridkot', 'toronto', 'québec', 'montréal']) index
+ df3 df1
a | b | c | x | y | z | |
---|---|---|---|---|---|---|
faridkot | NaN | NaN | NaN | NaN | NaN | NaN |
ferozpur | NaN | NaN | NaN | NaN | NaN | NaN |
montréal | NaN | NaN | NaN | NaN | NaN | NaN |
québec | NaN | NaN | NaN | NaN | NaN | NaN |
toronto | NaN | NaN | NaN | NaN | NaN | NaN |
Arithmetic methods with fill values
df2
a | b | c | |
---|---|---|---|
faridkot | 0.0 | 1.0 | 2.0 |
toronto | 3.0 | 4.0 | 5.0 |
québec | 6.0 | 7.0 | 8.0 |
montréal | 9.0 | 10.0 | 11.0 |
'faridkot', 'y'] = np.nan df2.loc[
df2
a | b | c | y | |
---|---|---|---|---|
faridkot | 0.0 | 1.0 | 2.0 | NaN |
toronto | 3.0 | 4.0 | 5.0 | NaN |
québec | 6.0 | 7.0 | 8.0 | NaN |
montréal | 9.0 | 10.0 | 11.0 | NaN |
help(pd.DataFrame._drop_axis)
Help on function _drop_axis in module pandas.core.generic:
_drop_axis(self: 'NDFrameT', labels, axis, level=None, errors: 'IgnoreRaise' = 'raise', only_slice: 'bool_t' = False) -> 'NDFrameT'
Drop labels from specified axis. Used in the ``drop`` method
internally.
Parameters
----------
labels : single label or list-like
axis : int or axis name
level : int or level name, default None
For MultiIndex
errors : {'ignore', 'raise'}, default 'raise'
If 'ignore', suppress error and existing labels are dropped.
only_slice : bool, default False
Whether indexing along columns should be view-only.
help(pd.DataFrame.drop)
print(dir(DataFrame))
['T', '_AXIS_LEN', '_AXIS_ORDERS', '_AXIS_TO_AXIS_NUMBER', '_HANDLED_TYPES', '__abs__', '__add__', '__and__', '__annotations__', '__array__', '__array_priority__', '__array_ufunc__', '__bool__', '__class__', '__contains__', '__copy__', '__dataframe__', '__deepcopy__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__divmod__', '__doc__', '__eq__', '__finalize__', '__floordiv__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__iadd__', '__iand__', '__ifloordiv__', '__imod__', '__imul__', '__init__', '__init_subclass__', '__invert__', '__ior__', '__ipow__', '__isub__', '__iter__', '__itruediv__', '__ixor__', '__le__', '__len__', '__lt__', '__matmul__', '__mod__', '__module__', '__mul__', '__ne__', '__neg__', '__new__', '__nonzero__', '__or__', '__pos__', '__pow__', '__radd__', '__rand__', '__rdivmod__', '__reduce__', '__reduce_ex__', '__repr__', '__rfloordiv__', '__rmatmul__', '__rmod__', '__rmul__', '__ror__', '__round__', '__rpow__', '__rsub__', '__rtruediv__', '__rxor__', '__setattr__', '__setitem__', '__setstate__', '__sizeof__', '__str__', '__sub__', '__subclasshook__', '__truediv__', '__weakref__', '__xor__', '_accessors', '_accum_func', '_add_numeric_operations', '_agg_examples_doc', '_agg_summary_and_see_also_doc', '_align_frame', '_align_series', '_append', '_arith_method', '_as_manager', '_box_col_values', '_can_fast_transpose', '_check_inplace_and_allows_duplicate_labels', '_check_inplace_setting', '_check_is_chained_assignment_possible', '_check_label_or_level_ambiguity', '_check_setitem_copy', '_clear_item_cache', '_clip_with_one_bound', '_clip_with_scalar', '_cmp_method', '_combine_frame', '_consolidate', '_consolidate_inplace', '_construct_axes_dict', '_construct_result', '_constructor', '_constructor_sliced', '_create_data_for_split_and_tight_to_dict', '_data', '_dir_additions', '_dir_deletions', '_dispatch_frame_op', '_drop_axis', '_drop_labels_or_levels', '_ensure_valid_index', '_find_valid_index', '_from_arrays', '_get_agg_axis', '_get_axis', '_get_axis_name', '_get_axis_number', '_get_axis_resolvers', '_get_block_manager_axis', '_get_bool_data', '_get_cleaned_column_resolvers', '_get_column_array', '_get_index_resolvers', '_get_item_cache', '_get_label_or_level_values', '_get_numeric_data', '_get_value', '_getitem_bool_array', '_getitem_multilevel', '_getitem_nocopy', '_gotitem', '_hidden_attrs', '_indexed_same', '_info_axis', '_info_axis_name', '_info_axis_number', '_info_repr', '_init_mgr', '_inplace_method', '_internal_names', '_internal_names_set', '_is_copy', '_is_homogeneous_type', '_is_label_or_level_reference', '_is_label_reference', '_is_level_reference', '_is_mixed_type', '_is_view', '_iset_item', '_iset_item_mgr', '_iset_not_inplace', '_iter_column_arrays', '_ixs', '_join_compat', '_logical_func', '_logical_method', '_maybe_cache_changed', '_maybe_update_cacher', '_metadata', '_min_count_stat_function', '_needs_reindex_multi', '_protect_consolidate', '_reduce', '_reduce_axis1', '_reindex_axes', '_reindex_columns', '_reindex_index', '_reindex_multi', '_reindex_with_indexers', '_rename', '_replace_columnwise', '_repr_data_resource_', '_repr_fits_horizontal_', '_repr_fits_vertical_', '_repr_html_', '_repr_latex_', '_reset_cache', '_reset_cacher', '_sanitize_column', '_series', '_set_axis', '_set_axis_name', '_set_axis_nocheck', '_set_is_copy', '_set_item', '_set_item_frame_value', '_set_item_mgr', '_set_value', '_setitem_array', '_setitem_frame', '_setitem_slice', '_slice', '_stat_axis', '_stat_axis_name', '_stat_axis_number', '_stat_function', '_stat_function_ddof', '_take', '_take_with_is_copy', '_to_dict_of_blocks', '_to_latex_via_styler', '_typ', '_update_inplace', '_validate_dtype', '_values', '_where', 'abs', 'add', 'add_prefix', 'add_suffix', 'agg', 'aggregate', 'align', 'all', 'any', 'apply', 'applymap', 'asfreq', 'asof', 'assign', 'astype', 'at', 'at_time', 'attrs', 'axes', 'backfill', 'between_time', 'bfill', 'bool', 'boxplot', 'clip', 'columns', 'combine', 'combine_first', 'compare', 'convert_dtypes', 'copy', 'corr', 'corrwith', 'count', 'cov', 'cummax', 'cummin', 'cumprod', 'cumsum', 'describe', 'diff', 'div', 'divide', 'dot', 'drop', 'drop_duplicates', 'droplevel', 'dropna', 'dtypes', 'duplicated', 'empty', 'eq', 'equals', 'eval', 'ewm', 'expanding', 'explode', 'ffill', 'fillna', 'filter', 'first', 'first_valid_index', 'flags', 'floordiv', 'from_dict', 'from_records', 'ge', 'get', 'groupby', 'gt', 'head', 'hist', 'iat', 'idxmax', 'idxmin', 'iloc', 'index', 'infer_objects', 'info', 'insert', 'interpolate', 'isetitem', 'isin', 'isna', 'isnull', 'items', 'iterrows', 'itertuples', 'join', 'keys', 'kurt', 'kurtosis', 'last', 'last_valid_index', 'le', 'loc', 'lt', 'mask', 'max', 'mean', 'median', 'melt', 'memory_usage', 'merge', 'min', 'mod', 'mode', 'mul', 'multiply', 'ndim', 'ne', 'nlargest', 'notna', 'notnull', 'nsmallest', 'nunique', 'pad', 'pct_change', 'pipe', 'pivot', 'pivot_table', 'plot', 'pop', 'pow', 'prod', 'product', 'quantile', 'query', 'radd', 'rank', 'rdiv', 'reindex', 'reindex_like', 'rename', 'rename_axis', 'reorder_levels', 'replace', 'resample', 'reset_index', 'rfloordiv', 'rmod', 'rmul', 'rolling', 'round', 'rpow', 'rsub', 'rtruediv', 'sample', 'select_dtypes', 'sem', 'set_axis', 'set_flags', 'set_index', 'shape', 'shift', 'size', 'skew', 'sort_index', 'sort_values', 'sparse', 'squeeze', 'stack', 'std', 'style', 'sub', 'subtract', 'sum', 'swapaxes', 'swaplevel', 'tail', 'take', 'to_clipboard', 'to_csv', 'to_dict', 'to_excel', 'to_feather', 'to_gbq', 'to_hdf', 'to_html', 'to_json', 'to_latex', 'to_markdown', 'to_numpy', 'to_orc', 'to_parquet', 'to_period', 'to_pickle', 'to_records', 'to_sql', 'to_stata', 'to_string', 'to_timestamp', 'to_xarray', 'to_xml', 'transform', 'transpose', 'truediv', 'truncate', 'tz_convert', 'tz_localize', 'unstack', 'update', 'value_counts', 'values', 'var', 'where', 'xs']
help(pd.DataFrame.describe)
help(pd.DataFrame._drop_axis)
= df2 df4
df4
a | b | c | y | |
---|---|---|---|---|
faridkot | 0.0 | 1.0 | 2.0 | NaN |
toronto | 3.0 | 4.0 | 5.0 | NaN |
québec | 6.0 | 7.0 | 8.0 | NaN |
montréal | 9.0 | 10.0 | 11.0 | NaN |
+ df4 df1
a | b | c | y | |
---|---|---|---|---|
faridkot | 3.0 | 5.0 | 7.0 | NaN |
ferozpur | NaN | NaN | NaN | NaN |
montréal | 15.0 | 17.0 | 19.0 | NaN |
québec | NaN | NaN | NaN | NaN |
toronto | NaN | NaN | NaN | NaN |
= 0
df4.fill_value
df4
a | b | c | y | |
---|---|---|---|---|
faridkot | 0.0 | 1.0 | 2.0 | NaN |
toronto | 3.0 | 4.0 | 5.0 | NaN |
québec | 6.0 | 7.0 | 8.0 | NaN |
montréal | 9.0 | 10.0 | 11.0 | NaN |
1/df4
a | b | c | y | |
---|---|---|---|---|
faridkot | inf | 1.000000 | 0.500000 | NaN |
toronto | 0.333333 | 0.250000 | 0.200000 | NaN |
québec | 0.166667 | 0.142857 | 0.125000 | NaN |
montréal | 0.111111 | 0.100000 | 0.090909 | NaN |
1) df4.rdiv(
a | b | c | y | |
---|---|---|---|---|
faridkot | inf | 1.000000 | 0.500000 | NaN |
toronto | 0.333333 | 0.250000 | 0.200000 | NaN |
québec | 0.166667 | 0.142857 | 0.125000 | NaN |
montréal | 0.111111 | 0.100000 | 0.090909 | NaN |
= df4.columns, fill_value=0) # not working df4.reindex(columns
a | b | c | y | |
---|---|---|---|---|
faridkot | 0.0 | 1.0 | 2.0 | NaN |
toronto | 3.0 | 4.0 | 5.0 | NaN |
québec | 6.0 | 7.0 | 8.0 | NaN |
montréal | 9.0 | 10.0 | 11.0 | NaN |
Operations between DataFrame and Series
= np.arange(12.).reshape((3,4))
arr
arr
array([[ 0., 1., 2., 3.],
[ 4., 5., 6., 7.],
[ 8., 9., 10., 11.]])
0] arr[
array([0., 1., 2., 3.])
# broadcasting
- arr[0] #subtracts from all rows arr
array([[0., 0., 0., 0.],
[4., 4., 4., 4.],
[8., 8., 8., 8.]])
frame
ohio | texas | burmingham | |
---|---|---|---|
a | 0 | 1 | 2 |
b | 3 | 4 | 5 |
c | 6 | 7 | 8 |
help(pd.Series)
series
= pd.Series(data = np.arange(3), index = ['a', 'b', 'c'])
series1
series1
a 0
b 1
c 2
dtype: int32
-series1 frame
a | b | burmingham | c | ohio | texas | |
---|---|---|---|---|---|---|
a | NaN | NaN | NaN | NaN | NaN | NaN |
b | NaN | NaN | NaN | NaN | NaN | NaN |
c | NaN | NaN | NaN | NaN | NaN | NaN |
+ series2 frame
a | b | burmingham | c | ohio | texas | |
---|---|---|---|---|---|---|
a | NaN | NaN | NaN | NaN | NaN | NaN |
b | NaN | NaN | NaN | NaN | NaN | NaN |
c | NaN | NaN | NaN | NaN | NaN | NaN |
Function application and ‘mapping’
= pd.DataFrame(np.random.standard_normal((4,3)),
frame2 = list('bde'),
columns = ['utah', 'faridkot', 'shahkot', 'malsahian'])
index frame2
b | d | e | |
---|---|---|---|
utah | -1.549165 | 0.443756 | 1.013167 |
faridkot | 1.130587 | -1.289388 | -1.210530 |
shahkot | 1.195553 | 0.274397 | 0.510043 |
malsahian | 0.713024 | -1.223282 | 1.857681 |
abs(frame2) #converts non-negative values to positive np.
b | d | e | |
---|---|---|---|
utah | 1.549165 | 0.443756 | 1.013167 |
faridkot | 1.130587 | 1.289388 | 1.210530 |
shahkot | 1.195553 | 0.274397 | 0.510043 |
malsahian | 0.713024 | 1.223282 | 1.857681 |
help(np.abs)
Help on ufunc:
absolute = <ufunc 'absolute'>
absolute(x, /, out=None, *, where=True, casting='same_kind', order='K', dtype=None, subok=True[, signature, extobj])
Calculate the absolute value element-wise.
``np.abs`` is a shorthand for this function.
Parameters
----------
x : array_like
Input array.
out : ndarray, None, or tuple of ndarray and None, optional
A location into which the result is stored. If provided, it must have
a shape that the inputs broadcast to. If not provided or None,
a freshly-allocated array is returned. A tuple (possible only as a
keyword argument) must have length equal to the number of outputs.
where : array_like, optional
This condition is broadcast over the input. At locations where the
condition is True, the `out` array will be set to the ufunc result.
Elsewhere, the `out` array will retain its original value.
Note that if an uninitialized `out` array is created via the default
``out=None``, locations within it where the condition is False will
remain uninitialized.
**kwargs
For other keyword-only arguments, see the
:ref:`ufunc docs <ufuncs.kwargs>`.
Returns
-------
absolute : ndarray
An ndarray containing the absolute value of
each element in `x`. For complex input, ``a + ib``, the
absolute value is :math:`\sqrt{ a^2 + b^2 }`.
This is a scalar if `x` is a scalar.
Examples
--------
>>> x = np.array([-1.2, 1.2])
>>> np.absolute(x)
array([ 1.2, 1.2])
>>> np.absolute(1.2 + 1j)
1.5620499351813308
Plot the function over ``[-10, 10]``:
>>> import matplotlib.pyplot as plt
>>> x = np.linspace(start=-10, stop=10, num=101)
>>> plt.plot(x, np.absolute(x))
>>> plt.show()
Plot the function over the complex plane:
>>> xx = x + 1j * x[:, np.newaxis]
>>> plt.imshow(np.abs(xx), extent=[-10, 10, -10, 10], cmap='gray')
>>> plt.show()
The `abs` function can be used as a shorthand for ``np.absolute`` on
ndarrays.
>>> x = np.array([-1.2, 1.2])
>>> abs(x)
array([1.2, 1.2])
#DataFrame's apply method
def f1(x):
return x.max() - x.min()
apply(f1) frame2.
b 2.744718
d 1.733145
e 3.068212
dtype: float64
# applying across columns
apply(f1, axis = 'columns') frame2.
utah 2.562332
faridkot 2.419976
shahkot 0.921156
malsahian 3.080963
dtype: float64
# modifying the function to return Series with multiple values
def f2(x):
return pd.Series([x.min(), x.max()], index= ['min', 'max'])
apply(f2) frame.
ohio | texas | burmingham | |
---|---|---|---|
min | 0 | 1 | 2 |
max | 6 | 7 | 8 |
frame
ohio | texas | burmingham | |
---|---|---|---|
a | 0 | 1 | 2 |
b | 3 | 4 | 5 |
c | 6 | 7 | 8 |
apply(f2) frame2.
b | d | e | |
---|---|---|---|
min | -1.549165 | -1.289388 | -1.210530 |
max | 1.195553 | 0.443756 | 1.857681 |
# apply map function
def my_format(x):
return f"{x:.2f}"
frame2.applymap(my_format)
b | d | e | |
---|---|---|---|
utah | -1.55 | 0.44 | 1.01 |
faridkot | 1.13 | -1.29 | -1.21 |
shahkot | 1.20 | 0.27 | 0.51 |
malsahian | 0.71 | -1.22 | 1.86 |
# applying map function in Series
'e'].map(my_format) frame2[
utah 1.01
faridkot -1.21
shahkot 0.51
malsahian 1.86
Name: e, dtype: object
Sorting and Ranking
= pd.Series(np.arange(4), index = ['d', 'a', 'b', 'c'])
obj2
obj2
d 0
a 1
b 2
c 3
dtype: int32
obj2.sort_index()
a 1
b 2
c 3
d 0
dtype: int32
# sorting in DataFrame can be done with either axis
= pd.DataFrame(np.arange(8).reshape((2, 4)),
frame = ['three', 'one'],
index = ['d', 'a', 'b', 'c'])
columns
frame
d | a | b | c | |
---|---|---|---|---|
three | 0 | 1 | 2 | 3 |
one | 4 | 5 | 6 | 7 |
frame.sort_index()
d | a | b | c | |
---|---|---|---|---|
one | 4 | 5 | 6 | 7 |
three | 0 | 1 | 2 | 3 |
= 'columns') frame.sort_index(axis
a | b | c | d | |
---|---|---|---|---|
three | 1 | 2 | 3 | 0 |
one | 5 | 6 | 7 | 4 |
# data can be stored in descending order aswell
= 'columns', ascending = False) frame.sort_index(axis
d | c | b | a | |
---|---|---|---|---|
three | 0 | 3 | 2 | 1 |
one | 4 | 7 | 6 | 5 |
# sorting a series by its values
= pd.Series([4, 7, -3, -2])
obj
obj.sort_values()
2 -3
3 -2
0 4
1 7
dtype: int64
# missing values get sorted to the end by default
= pd.Series([4, 3, 4, np.nan, 33, np.nan, -3, 3])
obj
obj.sort_values()
6 -3.0
1 3.0
7 3.0
0 4.0
2 4.0
4 33.0
3 NaN
5 NaN
dtype: float64
# using na_position to bring missing values to the front
= 'first') obj.sort_values(na_position
3 NaN
5 NaN
6 -3.0
1 3.0
7 3.0
0 4.0
2 4.0
4 33.0
dtype: float64
# while sorting a DataFrame
= pd.DataFrame({'b': [1, 2, 3, 4, 5], 'a':[3, 43, 33, 1, 5]})
frame
frame
b | a | |
---|---|---|
0 | 1 | 3 |
1 | 2 | 43 |
2 | 3 | 33 |
3 | 4 | 1 |
4 | 5 | 5 |
'a') frame.sort_values(
b | a | |
---|---|---|
3 | 4 | 1 |
0 | 1 | 3 |
4 | 5 | 5 |
2 | 3 | 33 |
1 | 2 | 43 |
# ranking
= pd.Series([4, 5, -5, 7, 8, 0, 4])
obj
obj.rank()
0 3.5
1 5.0
2 1.0
3 6.0
4 7.0
5 2.0
6 3.5
dtype: float64
# ranking in order the data is observed
='first') obj.rank(method
0 3.0
1 5.0
2 1.0
3 6.0
4 7.0
5 2.0
6 4.0
dtype: float64
= False) obj.rank(ascending
0 4.5
1 3.0
2 7.0
3 2.0
4 1.0
5 6.0
6 4.5
dtype: float64
# DataFrame for rank computation
frame
b | a | |
---|---|---|
0 | 1 | 3 |
1 | 2 | 43 |
2 | 3 | 33 |
3 | 4 | 1 |
4 | 5 | 5 |
= 'columns') frame.rank(axis
b | a | |
---|---|---|
0 | 1.0 | 2.0 |
1 | 1.0 | 2.0 |
2 | 1.0 | 2.0 |
3 | 2.0 | 1.0 |
4 | 1.5 | 1.5 |
Axis indices with duplicate labels
= pd.Series(np.arange(5), index=['a', 'a', 'b','b', 'c' ])
obj
obj
a 0
a 1
b 2
b 3
c 4
dtype: int32
obj.index.is_unique
False
'a'] obj[
a 0
a 1
dtype: int32
'c'] obj[
4
# DataFrame
= pd.DataFrame(np.random.standard_normal((5, 3)),
df = ['a', 'a', 'b', 'c', 'b'])
index
df
0 | 1 | 2 | |
---|---|---|---|
a | -1.817751 | 0.915854 | -0.389590 |
a | 0.603020 | 0.573012 | 1.070691 |
b | -0.903033 | 1.109707 | 0.874381 |
c | 2.529357 | -1.169854 | 0.676702 |
b | -0.368763 | 0.723758 | 0.375079 |
'b'] df.loc[
0 | 1 | 2 | |
---|---|---|---|
b | -0.903033 | 1.109707 | 0.874381 |
b | -0.368763 | 0.723758 | 0.375079 |
'c'] df.loc[
0 2.529357
1 -1.169854
2 0.676702
Name: c, dtype: float64
Descriptive statistics
sum() df.
0 0.042830
1 2.152477
2 2.607263
dtype: float64
sum(axis = 'columns') df.
a -1.291487
a 2.246723
b 1.081055
c 2.036205
b 0.730074
dtype: float64
df.describe()
0 | 1 | 2 | |
---|---|---|---|
count | 5.000000 | 5.000000 | 5.000000 |
mean | 0.008566 | 0.430495 | 0.521453 |
std | 1.659562 | 0.917106 | 0.570471 |
min | -1.817751 | -1.169854 | -0.389590 |
25% | -0.903033 | 0.573012 | 0.375079 |
50% | -0.368763 | 0.723758 | 0.676702 |
75% | 0.603020 | 0.915854 | 0.874381 |
max | 2.529357 | 1.109707 | 1.070691 |
df.cumsum()
0 | 1 | 2 | |
---|---|---|---|
a | -1.817751 | 0.915854 | -0.389590 |
a | -1.214731 | 1.488866 | 0.681101 |
b | -2.117763 | 2.598573 | 1.555481 |
c | 0.411594 | 1.428719 | 2.232183 |
b | 0.042830 | 2.152477 | 2.607263 |
Unique Values, Value counts, and Membership
= pd.Series(['c', 'd', 'a', 'b', 'n', 'm', 'g', "k", 'b', 'c', 'k'])
obj
= obj.unique()
uniques
uniques
array(['c', 'd', 'a', 'b', 'n', 'm', 'g', 'k'], dtype=object)
obj.value_counts()
c 2
b 2
k 2
d 1
a 1
n 1
m 1
g 1
Name: count, dtype: int64
= False) pd.value_counts(obj.to_numpy(), sort
c 2
d 1
a 1
b 2
n 1
m 1
g 1
k 2
Name: count, dtype: int64
# 'isin' is used for vectorized set memebership
obj
0 c
1 d
2 a
3 b
4 n
5 m
6 g
7 k
8 b
9 c
10 k
dtype: object
= obj.isin(['b', 'c'])
mask
mask
0 True
1 False
2 False
3 True
4 False
5 False
6 False
7 False
8 True
9 True
10 False
dtype: bool
obj[mask]
0 c
3 b
8 b
9 c
dtype: object
= pd.Series(['c', 'a', 'b', 'b', 'c'])
to_match
= pd.Series(['c', 'b', 'a'])
unique_vals
= pd.Index(unique_vals).get_indexer(to_match)
indices
indices
array([0, 2, 1, 1, 0], dtype=int64)
= pd.DataFrame({'ça va': [1, 2, 3, 4, 5],
data 'oui, ça va' : [43,3, 2, 4, 2],
'et toi': [3, 2, 44, 1, 5]})
data
ça va | oui, ça va | et toi | |
---|---|---|---|
0 | 1 | 43 | 3 |
1 | 2 | 3 | 2 |
2 | 3 | 2 | 44 |
3 | 4 | 4 | 1 |
4 | 5 | 2 | 5 |
# computing value counts
'ça va'].value_counts().sort_index() data[
ça va
1 1
2 1
3 1
4 1
5 1
Name: count, dtype: int64
'et toi'].value_counts().sort_index() data[
et toi
1 1
2 1
3 1
5 1
44 1
Name: count, dtype: int64
= data.apply(pd.value_counts).fillna(0)
result
result
ça va | oui, ça va | et toi | |
---|---|---|---|
1 | 1.0 | 0.0 | 1.0 |
2 | 1.0 | 2.0 | 1.0 |
3 | 1.0 | 1.0 | 1.0 |
4 | 1.0 | 1.0 | 0.0 |
5 | 1.0 | 0.0 | 1.0 |
43 | 0.0 | 1.0 | 0.0 |
44 | 0.0 | 0.0 | 1.0 |