helps in numerical computing (NumPy, SciPy)
helps with analytical libraries (scikit-learn, and data visualizatioon,
processes data without for loops

Data structures

Series
Data Frames
index objects

Functionality

Reindexing
Dropping entreies from axis
indexing, selection, and filtering
DataFrame selection with loc and iloc
integer indexing pitfalls
pitfalls with chained indexing
artihmetic and data alignment
arithmetic methods with fill values
Operations between DataFrame and Series
Function applicaiton and mapping
Sorting and Ranking
Axis indexed with dupicate labels # Summarizing and Descriptive statistics
correlation and variance
unique values, counts, and memberships

Series

import pandas as pd

import numpy as np

from pandas import Series, DataFrame

obj = pd.Series([4,2, 312, -3])

obj

0      4
1      2
2    312
3     -3
dtype: int64

obj2 = pd.Series([4,2, 312, -3], index = ['a', 'b', 'c', 'd'])

obj2

a      4
b      2
c    312
d     -3
dtype: int64

obj2.index

Index(['a', 'b', 'c', 'd'], dtype='object')

obj2[obj2 > 0]

a      4
b      2
c    312
dtype: int64

np.exp(obj2)

a     5.459815e+01
b     7.389056e+00
c    3.161392e+135
d     4.978707e-02
dtype: float64

'b' in obj2

True

'e' in obj2

False

sdata = {'ohio': 232, 'Texas': 332, 'Oregon': 34343}

obj3 = pd.Series(sdata)

obj3

ohio        232
Texas       332
Oregon    34343
dtype: int64

obj3.to_dict()

{'ohio': 232, 'Texas': 332, 'Oregon': 34343}

states = ['California', 'ohio', 'orgeon']

obj4 = pd.Series(sdata, index = states)

obj4

California      NaN
ohio          232.0
orgeon          NaN
dtype: float64

pd.isna(obj4) # is null

California     True
ohio          False
orgeon         True
dtype: bool

pd.notna(obj4)  #not null

California    False
ohio           True
orgeon        False
dtype: bool

obj3 + obj4

California      NaN
Oregon          NaN
Texas           NaN
ohio          464.0
orgeon          NaN
dtype: float64

obj4.name = 'population'

obj4.index.name = 'state'

obj4

state
California      NaN
ohio          232.0
orgeon          NaN
Name: population, dtype: float64

obj

0      4
1      2
2    312
3     -3
dtype: int64

# altering the index in place

obj.index = ['Kunal', 'Rahul', 'Raghav', 'Ryan']

obj

Kunal       4
Rahul       2
Raghav    312
Ryan       -3
dtype: int64

DataFrame

data = {'state': ['ohio', 'ohio', 'nevada',], 
       'year': [2000, 2001, 2002,], 
       'pop': [1.2, 1.3, 1.4,]}

frame = pd.DataFrame(data)

frame

	state	year	pop
0	ohio	2000	1.2
1	ohio	2001	1.3
2	nevada	2002	1.4

frame.head()

	state	year	pop
0	ohio	2000	1.2
1	ohio	2001	1.3
2	nevada	2002	1.4

frame.tail()

	state	year	pop
0	ohio	2000	1.2
1	ohio	2001	1.3
2	nevada	2002	1.4

# passing another column in the dataframe

frame2 = pd.DataFrame(data, columns = ['state', 'year', 'pop', 'debt'])

frame2

	state	year	pop	debt
0	ohio	2000	1.2	NaN
1	ohio	2001	1.3	NaN
2	nevada	2002	1.4	NaN

# changing the order of columns


frame2 = pd.DataFrame(data, columns = [ 'year', 'pop', 'debt', 'state'])

frame2

	state	year	pop	debt
0	ohio	2000	1.2	NaN
1	ohio	2001	1.3	NaN
2	nevada	2002	1.4	NaN

frame2.year

0    2000
1    2001
2    2002
Name: year, dtype: int64

frame2.loc[1]

year     2001
pop       1.3
debt      NaN
state    ohio
Name: 1, dtype: object

frame2.iloc[2]

year       2002
pop         1.4
debt        NaN
state    nevada
Name: 2, dtype: object

frame2.pop

<bound method DataFrame.pop of    year  pop debt   state
0  2000  1.2  NaN    ohio
1  2001  1.3  NaN    ohio
2  2002  1.4  NaN  nevada>

frame2.year

0    2000
1    2001
2    2002
Name: year, dtype: int64

# assigning values

frame2['debt'] = 14.5

frame2

	year	pop	debt	state
0	2000	1.2	14.5	ohio
1	2001	1.3	14.5	ohio
2	2002	1.4	14.5	nevada

# assiging a new column (resuls in new column if it does not exist before)

frame2['eastern'] = frame2['state']  =='ohio'

frame2

	year	pop	debt	state	eastern
0	2000	1.2	14.5	ohio	True
1	2001	1.3	14.5	ohio	True
2	2002	1.4	14.5	nevada	False

# transposing
frame2.T

	0	1	2
year	2000	2001	2002
pop	1.2	1.3	1.4
debt	14.5	14.5	14.5
state	ohio	ohio	nevada
eastern	True	True	False

pd.DataFrame(data)

	state	year	pop
0	ohio	2000	1.2
1	ohio	2001	1.3
2	nevada	2002	1.4

frame2.index.name = 'year'

frame2.columns.name = 'state'  # starts with state column

frame2

state	year	pop	debt	state	eastern
year
0	2000	1.2	14.5	ohio	True
1	2001	1.3	14.5	ohio	True
2	2002	1.4	14.5	nevada	False

frame2.to_numpy()

array([[2000, 1.2, 14.5, 'ohio', True],
       [2001, 1.3, 14.5, 'ohio', True],
       [2002, 1.4, 14.5, 'nevada', False]], dtype=object)

index objects

obj4 = pd.Series(np.arange(3), index = ['a', 'b', 'c'])

index = obj4.index


index

Index(['a', 'b', 'c'], dtype='object')

index [1:]

Index(['b', 'c'], dtype='object')

# index objects are immutable

index[1]= 'd' #type error

labels = pd.Index(np.arange(3))

labels

Index([0, 1, 2], dtype='int32')

obj2 = pd.Series([1.5, -2.5, 0], index = labels)


obj2

0    1.5
1   -2.5
2    0.0
dtype: float64

obj2.index is labels

True

frame2

state	year	pop	debt	state	eastern
year
0	2000	1.2	14.5	ohio	True
1	2001	1.3	14.5	ohio	True
2	2002	1.4	14.5	nevada	False

frame2.columns

Index(['year', 'pop', 'debt', 'state', 'eastern'], dtype='object', name='state')

2003 in frame2.index

False

# unlike python, a pandas index can contain duplicate labels

pd.Index (['foo', 'boo', 'bar', 'baa', 'etc', 'foo'])

Index(['foo', 'boo', 'bar', 'baa', 'etc', 'foo'], dtype='object')

Reindexing

obj = pd.Series([4.5,48, -3,2,3.9], index= ['a', 'b', 'c', 'd', 'e'])

obj

a     4.5
b    48.0
c    -3.0
d     2.0
e     3.9
dtype: float64

# reindexing
obj2 = obj.reindex(['b', 'a', 'c', 'd', 'e'])

obj2

b    48.0
a     4.5
c    -3.0
d     2.0
e     3.9
dtype: float64

# time series data fill
obj3 = pd.Series(['blue', 'purple', 'yellow'], index = [0, 2, 4])

obj3

0      blue
2    purple
4    yellow
dtype: object

# forward filling the values using ffill
obj3.reindex(np.arange(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

# backward fill
obj3.reindex(np.arange(6), method = 'bfill')

0      blue
1    purple
2    purple
3    yellow
4    yellow
5       NaN
dtype: object

frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
                    index = ['a', 'b', 'c'],
                    columns= ['ohio', 'texas', 'burmingham'])

frame

	ohio	texas	burmingham
a	0	1	2
b	3	4	5
c	6	7	8

frame2 = frame.reindex(index=['a', 'b', 'c', 'd'])

frame2

	ohio	texas	burmingham
a	0.0	1.0	2.0
b	3.0	4.0	5.0
c	6.0	7.0	8.0
d	NaN	NaN	NaN

# reindexing columns with column keyword

states = ['london', 'texus', 'surrey']

frame.reindex(columns = states)

	london	texus	surrey
a	NaN	NaN	NaN
b	NaN	NaN	NaN
c	NaN	NaN	NaN

Dropping entries from Axis

obj = pd.Series(np.arange(5.), index = ['a', 'b', 'c', 'd', 'e'])

obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

new_obj = obj.drop('c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

obj.drop(['d', 'e'])

a    0.0
b    1.0
c    2.0
dtype: float64

# in DataFrame
data = pd.DataFrame(np.arange(16).reshape((4,4)),
                    index=['québec', 'montréal', 'toronto', 'sainte-anne'],
                    columns = ['one', 'two', 'three', 'four'])
data

	one	two	three	four
québec	0	1	2	3
montréal	4	5	6	7
toronto	8	9	10	11
sainte-anne	12	13	14	15

# using drop method
data.drop(index=['toronto', 'sainte-anne'])

	one	two	three	four
québec	0	1	2	3
montréal	4	5	6	7

# dropping using axis method (axis = 1 = columns)

data.drop('two', axis=1)

	one	three	four
québec	0	2	3
montréal	4	6	7
toronto	8	10	11
sainte-anne	12	14	15

data.drop(['three', 'four'], axis='columns')

	one	two
québec	0	1
montréal	4	5
toronto	8	9
sainte-anne	12	13

Indexing, Selecting, and Filtering

obj = pd.Series(np.arange(4.), index= ['a', 'b', 'c', 'd'])

obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

obj['b']

1.0

obj[1]

1.0

obj[2:4]

c    2.0
d    3.0
dtype: float64

obj[obj<2]

a    0.0
b    1.0
dtype: float64

obj.loc[['b', 'c']]

b    1.0
c    2.0
dtype: float64

obj1 = pd.Series([1,2,3], index = [2,0,1])

obj2 = pd.Series([1,2,3], index = ['a', 'b', 'c'])

obj1

2    1
0    2
1    3
dtype: int64

obj2

a    1
b    2
c    3
dtype: int64

# loc fails as index doesnot contain integers
obj2.loc[[0, 1]]

# fix this

obj2.loc['b':'c']

b    2
c    3
dtype: int64

# so, prefer using iloc with integers

obj1.iloc[[0,1,2]]

2    1
0    2
1    3
dtype: int64

obj2.iloc[[0,1,2]]

a    1
b    2
c    3
dtype: int64

# assigning values

obj2.loc['b':'c'] = 5

obj2

a    1
b    5
c    5
dtype: int64

data

	one	two	three	four
québec	0	1	2	3
montréal	4	5	6	7
toronto	8	9	10	11
sainte-anne	12	13	14	15

data[:2]

	one	two	three	four
québec	0	1	2	3
montréal	4	5	6	7

# booleans
data < 5

	one	two	three	four
québec	True	True	True	True
montréal	True	False	False	False
toronto	False	False	False	False
sainte-anne	False	False	False	False

# assigning values
data[data < 5] = 0

data

	one	two	three	four
québec	0	0	0	0
montréal	0	5	6	7
toronto	8	9	10	11
sainte-anne	12	13	14	15

selection of DataFrame with loc and iloc



data

	one	two	three	four
québec	0	0	0	0
montréal	0	5	6	7
toronto	8	9	10	11
sainte-anne	12	13	14	15

data.loc['montréal']

one      0
two      5
three    6
four     7
Name: montréal, dtype: int32

data.loc[['montréal', 'québec']]

	one	two	three	four
montréal	0	5	6	7
québec	0	0	0	0

data.loc['montréal', ['two', 'three']]

two      5
three    6
Name: montréal, dtype: int32

# similar operations with iloc
data.iloc[2]

one       8
two       9
three    10
four     11
Name: toronto, dtype: int32

data.iloc[[2,1]]  #third row and second row

	one	two	three	four
toronto	8	9	10	11
montréal	0	5	6	7

data.iloc[2,[3,0,1]] #third row (three elements in order)

four    11
one      8
two      9
Name: toronto, dtype: int32

data.iloc[[1,2],[3,0,1]]

	four	one	two
montréal	7	0	5
toronto	11	8	9

integer indexing pitfalls

series = pd.Series(np.arange(3.))

series

0    0.0
1    1.0
2    2.0
dtype: float64

# fails here but works fine with iloc and loc 
series[-1]



# value error; key error: -1

series.iloc[-1]

2.0

# non-integer doesnot do this ambiguity

series2 = pd.Series(np.arange(3.0), index = ['a', 'b', 'c'])

series2[-1]

2.0

Pitfalls with chained indexing

data.loc[:, 'one'] = 1

data

	one	two	three	four
québec	1	0	0	0
montréal	1	5	6	7
toronto	1	9	10	11
sainte-anne	1	13	14	15

data.iloc[2] = 5

data

	one	two	three	four
québec	1	0	0	0
montréal	1	5	6	7
toronto	5	5	5	5
sainte-anne	1	13	14	15

data.loc[data['four'] > 5]  = 3

data

	one	two	three	four
québec	1	0	0	0
montréal	3	3	3	3
toronto	5	5	5	5
sainte-anne	3	3	3	3

# the data gets modified, but it is not the way that was asked for

# fixing it with loc operation

data.loc[data.three == 10, "three"] = 9

data

	one	two	three	four
québec	1	0	0	0
montréal	3	3	3	3
toronto	5	5	5	5
sainte-anne	3	3	3	3

Arithmetic and Data Alignment

s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index = ['a', 'c', 'd', 'e'])

s2 = pd.Series([1.2, -3, -.3, -.33, -43.2], index = ['e', 'j', 'o', 't', 'y'])

s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

s2

e     1.20
j    -3.00
o    -0.30
t    -0.33
y   -43.20
dtype: float64

# adding these- missing values donot overlap

s1+s2

a    NaN
c    NaN
d    NaN
e    2.7
j    NaN
o    NaN
t    NaN
y    NaN
dtype: float64

# in case of DataFrame, alignment is performed on both rows and columns

df1 = pd.DataFrame(np.arange(9.).reshape((3,3)), 
                  columns = list('abc'),
                  index = ['ferozpur', 'faridkot', 'montréal'])

df2 = pd.DataFrame(np.arange(12.).reshape((4,3)),
                  columns = list('abc'),
                   index = ['faridkot', 'toronto', 'québec', 'montréal'])

df1

	a	b	c
ferozpur	0.0	1.0	2.0
faridkot	3.0	4.0	5.0
montréal	6.0	7.0	8.0

df2

	a	b	c
faridkot	0.0	1.0	2.0
toronto	3.0	4.0	5.0
québec	6.0	7.0	8.0
montréal	9.0	10.0	11.0

df1 + df2  #because the columns were same, it added those numbers

	a	b	c
faridkot	3.0	5.0	7.0
ferozpur	NaN	NaN	NaN
montréal	15.0	17.0	19.0
québec	NaN	NaN	NaN
toronto	NaN	NaN	NaN

# changing columns names will give all NAN (null values)
df3 = pd.DataFrame(np.arange(12.).reshape((4,3)),
                  columns = list('xyz'),
                   index = ['faridkot', 'toronto', 'québec', 'montréal'])

df1 + df3

	a	b	c	x	y	z
faridkot	NaN	NaN	NaN	NaN	NaN	NaN
ferozpur	NaN	NaN	NaN	NaN	NaN	NaN
montréal	NaN	NaN	NaN	NaN	NaN	NaN
québec	NaN	NaN	NaN	NaN	NaN	NaN
toronto	NaN	NaN	NaN	NaN	NaN	NaN

Arithmetic methods with fill values

df2

	a	b	c
faridkot	0.0	1.0	2.0
toronto	3.0	4.0	5.0
québec	6.0	7.0	8.0
montréal	9.0	10.0	11.0

df2.loc['faridkot', 'y'] = np.nan

df2

	a	b	c	y
faridkot	0.0	1.0	2.0	NaN
toronto	3.0	4.0	5.0	NaN
québec	6.0	7.0	8.0	NaN
montréal	9.0	10.0	11.0	NaN

help(pd.DataFrame._drop_axis)

Help on function _drop_axis in module pandas.core.generic:

_drop_axis(self: 'NDFrameT', labels, axis, level=None, errors: 'IgnoreRaise' = 'raise', only_slice: 'bool_t' = False) -> 'NDFrameT'
    Drop labels from specified axis. Used in the ``drop`` method
    internally.
    
    Parameters
    ----------
    labels : single label or list-like
    axis : int or axis name
    level : int or level name, default None
        For MultiIndex
    errors : {'ignore', 'raise'}, default 'raise'
        If 'ignore', suppress error and existing labels are dropped.
    only_slice : bool, default False
        Whether indexing along columns should be view-only.

help(pd.DataFrame.drop)

print(dir(DataFrame))

['T', '_AXIS_LEN', '_AXIS_ORDERS', '_AXIS_TO_AXIS_NUMBER', '_HANDLED_TYPES', '__abs__', '__add__', '__and__', '__annotations__', '__array__', '__array_priority__', '__array_ufunc__', '__bool__', '__class__', '__contains__', '__copy__', '__dataframe__', '__deepcopy__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__divmod__', '__doc__', '__eq__', '__finalize__', '__floordiv__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__iadd__', '__iand__', '__ifloordiv__', '__imod__', '__imul__', '__init__', '__init_subclass__', '__invert__', '__ior__', '__ipow__', '__isub__', '__iter__', '__itruediv__', '__ixor__', '__le__', '__len__', '__lt__', '__matmul__', '__mod__', '__module__', '__mul__', '__ne__', '__neg__', '__new__', '__nonzero__', '__or__', '__pos__', '__pow__', '__radd__', '__rand__', '__rdivmod__', '__reduce__', '__reduce_ex__', '__repr__', '__rfloordiv__', '__rmatmul__', '__rmod__', '__rmul__', '__ror__', '__round__', '__rpow__', '__rsub__', '__rtruediv__', '__rxor__', '__setattr__', '__setitem__', '__setstate__', '__sizeof__', '__str__', '__sub__', '__subclasshook__', '__truediv__', '__weakref__', '__xor__', '_accessors', '_accum_func', '_add_numeric_operations', '_agg_examples_doc', '_agg_summary_and_see_also_doc', '_align_frame', '_align_series', '_append', '_arith_method', '_as_manager', '_box_col_values', '_can_fast_transpose', '_check_inplace_and_allows_duplicate_labels', '_check_inplace_setting', '_check_is_chained_assignment_possible', '_check_label_or_level_ambiguity', '_check_setitem_copy', '_clear_item_cache', '_clip_with_one_bound', '_clip_with_scalar', '_cmp_method', '_combine_frame', '_consolidate', '_consolidate_inplace', '_construct_axes_dict', '_construct_result', '_constructor', '_constructor_sliced', '_create_data_for_split_and_tight_to_dict', '_data', '_dir_additions', '_dir_deletions', '_dispatch_frame_op', '_drop_axis', '_drop_labels_or_levels', '_ensure_valid_index', '_find_valid_index', '_from_arrays', '_get_agg_axis', '_get_axis', '_get_axis_name', '_get_axis_number', '_get_axis_resolvers', '_get_block_manager_axis', '_get_bool_data', '_get_cleaned_column_resolvers', '_get_column_array', '_get_index_resolvers', '_get_item_cache', '_get_label_or_level_values', '_get_numeric_data', '_get_value', '_getitem_bool_array', '_getitem_multilevel', '_getitem_nocopy', '_gotitem', '_hidden_attrs', '_indexed_same', '_info_axis', '_info_axis_name', '_info_axis_number', '_info_repr', '_init_mgr', '_inplace_method', '_internal_names', '_internal_names_set', '_is_copy', '_is_homogeneous_type', '_is_label_or_level_reference', '_is_label_reference', '_is_level_reference', '_is_mixed_type', '_is_view', '_iset_item', '_iset_item_mgr', '_iset_not_inplace', '_iter_column_arrays', '_ixs', '_join_compat', '_logical_func', '_logical_method', '_maybe_cache_changed', '_maybe_update_cacher', '_metadata', '_min_count_stat_function', '_needs_reindex_multi', '_protect_consolidate', '_reduce', '_reduce_axis1', '_reindex_axes', '_reindex_columns', '_reindex_index', '_reindex_multi', '_reindex_with_indexers', '_rename', '_replace_columnwise', '_repr_data_resource_', '_repr_fits_horizontal_', '_repr_fits_vertical_', '_repr_html_', '_repr_latex_', '_reset_cache', '_reset_cacher', '_sanitize_column', '_series', '_set_axis', '_set_axis_name', '_set_axis_nocheck', '_set_is_copy', '_set_item', '_set_item_frame_value', '_set_item_mgr', '_set_value', '_setitem_array', '_setitem_frame', '_setitem_slice', '_slice', '_stat_axis', '_stat_axis_name', '_stat_axis_number', '_stat_function', '_stat_function_ddof', '_take', '_take_with_is_copy', '_to_dict_of_blocks', '_to_latex_via_styler', '_typ', '_update_inplace', '_validate_dtype', '_values', '_where', 'abs', 'add', 'add_prefix', 'add_suffix', 'agg', 'aggregate', 'align', 'all', 'any', 'apply', 'applymap', 'asfreq', 'asof', 'assign', 'astype', 'at', 'at_time', 'attrs', 'axes', 'backfill', 'between_time', 'bfill', 'bool', 'boxplot', 'clip', 'columns', 'combine', 'combine_first', 'compare', 'convert_dtypes', 'copy', 'corr', 'corrwith', 'count', 'cov', 'cummax', 'cummin', 'cumprod', 'cumsum', 'describe', 'diff', 'div', 'divide', 'dot', 'drop', 'drop_duplicates', 'droplevel', 'dropna', 'dtypes', 'duplicated', 'empty', 'eq', 'equals', 'eval', 'ewm', 'expanding', 'explode', 'ffill', 'fillna', 'filter', 'first', 'first_valid_index', 'flags', 'floordiv', 'from_dict', 'from_records', 'ge', 'get', 'groupby', 'gt', 'head', 'hist', 'iat', 'idxmax', 'idxmin', 'iloc', 'index', 'infer_objects', 'info', 'insert', 'interpolate', 'isetitem', 'isin', 'isna', 'isnull', 'items', 'iterrows', 'itertuples', 'join', 'keys', 'kurt', 'kurtosis', 'last', 'last_valid_index', 'le', 'loc', 'lt', 'mask', 'max', 'mean', 'median', 'melt', 'memory_usage', 'merge', 'min', 'mod', 'mode', 'mul', 'multiply', 'ndim', 'ne', 'nlargest', 'notna', 'notnull', 'nsmallest', 'nunique', 'pad', 'pct_change', 'pipe', 'pivot', 'pivot_table', 'plot', 'pop', 'pow', 'prod', 'product', 'quantile', 'query', 'radd', 'rank', 'rdiv', 'reindex', 'reindex_like', 'rename', 'rename_axis', 'reorder_levels', 'replace', 'resample', 'reset_index', 'rfloordiv', 'rmod', 'rmul', 'rolling', 'round', 'rpow', 'rsub', 'rtruediv', 'sample', 'select_dtypes', 'sem', 'set_axis', 'set_flags', 'set_index', 'shape', 'shift', 'size', 'skew', 'sort_index', 'sort_values', 'sparse', 'squeeze', 'stack', 'std', 'style', 'sub', 'subtract', 'sum', 'swapaxes', 'swaplevel', 'tail', 'take', 'to_clipboard', 'to_csv', 'to_dict', 'to_excel', 'to_feather', 'to_gbq', 'to_hdf', 'to_html', 'to_json', 'to_latex', 'to_markdown', 'to_numpy', 'to_orc', 'to_parquet', 'to_period', 'to_pickle', 'to_records', 'to_sql', 'to_stata', 'to_string', 'to_timestamp', 'to_xarray', 'to_xml', 'transform', 'transpose', 'truediv', 'truncate', 'tz_convert', 'tz_localize', 'unstack', 'update', 'value_counts', 'values', 'var', 'where', 'xs']

help(pd.DataFrame.describe)

help(pd.DataFrame._drop_axis)

df4 = df2

df4

	a	b	c	y
faridkot	0.0	1.0	2.0	NaN
toronto	3.0	4.0	5.0	NaN
québec	6.0	7.0	8.0	NaN
montréal	9.0	10.0	11.0	NaN

df1 + df4

	a	b	c	y
faridkot	3.0	5.0	7.0	NaN
ferozpur	NaN	NaN	NaN	NaN
montréal	15.0	17.0	19.0	NaN
québec	NaN	NaN	NaN	NaN
toronto	NaN	NaN	NaN	NaN

df4.fill_value = 0

df4

	a	b	c	y
faridkot	0.0	1.0	2.0	NaN
toronto	3.0	4.0	5.0	NaN
québec	6.0	7.0	8.0	NaN
montréal	9.0	10.0	11.0	NaN

1/df4

	a	b	c	y
faridkot	inf	1.000000	0.500000	NaN
toronto	0.333333	0.250000	0.200000	NaN
québec	0.166667	0.142857	0.125000	NaN
montréal	0.111111	0.100000	0.090909	NaN

df4.rdiv(1)

	a	b	c	y
faridkot	inf	1.000000	0.500000	NaN
toronto	0.333333	0.250000	0.200000	NaN
québec	0.166667	0.142857	0.125000	NaN
montréal	0.111111	0.100000	0.090909	NaN

df4.reindex(columns = df4.columns, fill_value=0) # not working

	a	b	c	y
faridkot	0.0	1.0	2.0	NaN
toronto	3.0	4.0	5.0	NaN
québec	6.0	7.0	8.0	NaN
montréal	9.0	10.0	11.0	NaN

Operations between DataFrame and Series

arr = np.arange(12.).reshape((3,4))

arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

arr[0]

array([0., 1., 2., 3.])

# broadcasting

arr - arr[0]   #subtracts from all rows

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

frame

	ohio	texas	burmingham
a	0	1	2
b	3	4	5
c	6	7	8

help(pd.Series)

series

series1 = pd.Series(data = np.arange(3), index = ['a', 'b', 'c'])

series1

a    0
b    1
c    2
dtype: int32

frame-series1

	a	b	burmingham	c	ohio	texas
a	NaN	NaN	NaN	NaN	NaN	NaN
b	NaN	NaN	NaN	NaN	NaN	NaN
c	NaN	NaN	NaN	NaN	NaN	NaN

frame + series2

	a	b	burmingham	c	ohio	texas
a	NaN	NaN	NaN	NaN	NaN	NaN
b	NaN	NaN	NaN	NaN	NaN	NaN
c	NaN	NaN	NaN	NaN	NaN	NaN

Function application and ‘mapping’

frame2 = pd.DataFrame(np.random.standard_normal((4,3)),
                     columns = list('bde'),
                     index = ['utah', 'faridkot', 'shahkot', 'malsahian'])
frame2

	b	d	e
utah	-1.549165	0.443756	1.013167
faridkot	1.130587	-1.289388	-1.210530
shahkot	1.195553	0.274397	0.510043
malsahian	0.713024	-1.223282	1.857681

np.abs(frame2)  #converts non-negative values to positive

	b	d	e
utah	1.549165	0.443756	1.013167
faridkot	1.130587	1.289388	1.210530
shahkot	1.195553	0.274397	0.510043
malsahian	0.713024	1.223282	1.857681

help(np.abs)

Help on ufunc:

absolute = <ufunc 'absolute'>
    absolute(x, /, out=None, *, where=True, casting='same_kind', order='K', dtype=None, subok=True[, signature, extobj])
    
    Calculate the absolute value element-wise.
    
    ``np.abs`` is a shorthand for this function.
    
    Parameters
    ----------
    x : array_like
        Input array.
    out : ndarray, None, or tuple of ndarray and None, optional
        A location into which the result is stored. If provided, it must have
        a shape that the inputs broadcast to. If not provided or None,
        a freshly-allocated array is returned. A tuple (possible only as a
        keyword argument) must have length equal to the number of outputs.
    where : array_like, optional
        This condition is broadcast over the input. At locations where the
        condition is True, the `out` array will be set to the ufunc result.
        Elsewhere, the `out` array will retain its original value.
        Note that if an uninitialized `out` array is created via the default
        ``out=None``, locations within it where the condition is False will
        remain uninitialized.
    **kwargs
        For other keyword-only arguments, see the
        :ref:`ufunc docs <ufuncs.kwargs>`.
    
    Returns
    -------
    absolute : ndarray
        An ndarray containing the absolute value of
        each element in `x`.  For complex input, ``a + ib``, the
        absolute value is :math:`\sqrt{ a^2 + b^2 }`.
        This is a scalar if `x` is a scalar.
    
    Examples
    --------
    >>> x = np.array([-1.2, 1.2])
    >>> np.absolute(x)
    array([ 1.2,  1.2])
    >>> np.absolute(1.2 + 1j)
    1.5620499351813308
    
    Plot the function over ``[-10, 10]``:
    
    >>> import matplotlib.pyplot as plt
    
    >>> x = np.linspace(start=-10, stop=10, num=101)
    >>> plt.plot(x, np.absolute(x))
    >>> plt.show()
    
    Plot the function over the complex plane:
    
    >>> xx = x + 1j * x[:, np.newaxis]
    >>> plt.imshow(np.abs(xx), extent=[-10, 10, -10, 10], cmap='gray')
    >>> plt.show()
    
    The `abs` function can be used as a shorthand for ``np.absolute`` on
    ndarrays.
    
    >>> x = np.array([-1.2, 1.2])
    >>> abs(x)
    array([1.2, 1.2])

#DataFrame's apply method

def f1(x):
    return x.max() - x.min()

frame2.apply(f1)

b    2.744718
d    1.733145
e    3.068212
dtype: float64

# applying across columns

frame2.apply(f1, axis = 'columns')

utah         2.562332
faridkot     2.419976
shahkot      0.921156
malsahian    3.080963
dtype: float64

# modifying the function to return Series with multiple values

def f2(x):
    return pd.Series([x.min(), x.max()], index= ['min', 'max'])

frame.apply(f2)

	ohio	texas	burmingham
min	0	1	2
max	6	7	8

frame

	ohio	texas	burmingham
a	0	1	2
b	3	4	5
c	6	7	8

frame2.apply(f2)

	b	d	e
min	-1.549165	-1.289388	-1.210530
max	1.195553	0.443756	1.857681

# apply map function

def my_format(x):
    return f"{x:.2f}"

frame2.applymap(my_format)

	b	d	e
utah	-1.55	0.44	1.01
faridkot	1.13	-1.29	-1.21
shahkot	1.20	0.27	0.51
malsahian	0.71	-1.22	1.86

# applying map function in Series

frame2['e'].map(my_format)

utah          1.01
faridkot     -1.21
shahkot       0.51
malsahian     1.86
Name: e, dtype: object

Sorting and Ranking

obj2 = pd.Series(np.arange(4), index = ['d', 'a', 'b', 'c'])

obj2

d    0
a    1
b    2
c    3
dtype: int32

obj2.sort_index()

a    1
b    2
c    3
d    0
dtype: int32

# sorting in DataFrame can be done with either axis

frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index = ['three', 'one'],
                     columns = ['d', 'a', 'b', 'c'])


frame

	d	a	b	c
three	0	1	2	3
one	4	5	6	7

frame.sort_index()

	d	a	b	c
one	4	5	6	7
three	0	1	2	3

frame.sort_index(axis= 'columns')

	a	b	c	d
three	1	2	3	0
one	5	6	7	4

# data can be stored in descending order aswell

frame.sort_index(axis = 'columns', ascending = False)

	d	c	b	a
three	0	3	2	1
one	4	7	6	5

# sorting a series by its values

obj = pd.Series([4, 7, -3, -2])

obj.sort_values()

2   -3
3   -2
0    4
1    7
dtype: int64

# missing values get sorted to the end by default

obj = pd.Series([4, 3, 4, np.nan, 33, np.nan, -3, 3])

obj.sort_values()

6    -3.0
1     3.0
7     3.0
0     4.0
2     4.0
4    33.0
3     NaN
5     NaN
dtype: float64

# using na_position to bring missing values to the front
obj.sort_values(na_position= 'first')

3     NaN
5     NaN
6    -3.0
1     3.0
7     3.0
0     4.0
2     4.0
4    33.0
dtype: float64

# while sorting a DataFrame 

frame = pd.DataFrame({'b': [1, 2, 3, 4, 5], 'a':[3, 43, 33, 1, 5]})

frame

	b	a
0	1	3
1	2	43
2	3	33
3	4	1
4	5	5

frame.sort_values('a')

	b	a
3	4	1
0	1	3
4	5	5
2	3	33
1	2	43

# ranking
obj = pd.Series([4, 5, -5, 7, 8, 0, 4])

obj.rank()

0    3.5
1    5.0
2    1.0
3    6.0
4    7.0
5    2.0
6    3.5
dtype: float64

# ranking in order the data is observed

obj.rank(method='first')

0    3.0
1    5.0
2    1.0
3    6.0
4    7.0
5    2.0
6    4.0
dtype: float64

obj.rank(ascending = False)

0    4.5
1    3.0
2    7.0
3    2.0
4    1.0
5    6.0
6    4.5
dtype: float64

# DataFrame for rank computation

frame

	b	a
0	1	3
1	2	43
2	3	33
3	4	1
4	5	5

frame.rank(axis = 'columns')

	b	a
0	1.0	2.0
1	1.0	2.0
2	1.0	2.0
3	2.0	1.0
4	1.5	1.5

Axis indices with duplicate labels

obj = pd.Series(np.arange(5), index=['a', 'a', 'b','b', 'c' ])

obj

a    0
a    1
b    2
b    3
c    4
dtype: int32

obj.index.is_unique

False

obj['a']

a    0
a    1
dtype: int32

obj['c']

# DataFrame
df = pd.DataFrame(np.random.standard_normal((5, 3)),
                 index = ['a', 'a', 'b', 'c', 'b'])

df

	0	1	2
a	-1.817751	0.915854	-0.389590
a	0.603020	0.573012	1.070691
b	-0.903033	1.109707	0.874381
c	2.529357	-1.169854	0.676702
b	-0.368763	0.723758	0.375079

df.loc['b']

	0	1	2
b	-0.903033	1.109707	0.874381
b	-0.368763	0.723758	0.375079

df.loc['c']

0    2.529357
1   -1.169854
2    0.676702
Name: c, dtype: float64

Descriptive statistics

df.sum()

0    0.042830
1    2.152477
2    2.607263
dtype: float64

df.sum(axis = 'columns')

a   -1.291487
a    2.246723
b    1.081055
c    2.036205
b    0.730074
dtype: float64

df.describe()

	0	1	2
count	5.000000	5.000000	5.000000
mean	0.008566	0.430495	0.521453
std	1.659562	0.917106	0.570471
min	-1.817751	-1.169854	-0.389590
25%	-0.903033	0.573012	0.375079
50%	-0.368763	0.723758	0.676702
75%	0.603020	0.915854	0.874381
max	2.529357	1.109707	1.070691

df.cumsum()

	0	1	2
a	-1.817751	0.915854	-0.389590
a	-1.214731	1.488866	0.681101
b	-2.117763	2.598573	1.555481
c	0.411594	1.428719	2.232183
b	0.042830	2.152477	2.607263

Unique Values, Value counts, and Membership

obj = pd.Series(['c', 'd', 'a', 'b', 'n', 'm', 'g', "k", 'b', 'c', 'k'])

uniques = obj.unique()

uniques

array(['c', 'd', 'a', 'b', 'n', 'm', 'g', 'k'], dtype=object)

obj.value_counts()

c    2
b    2
k    2
d    1
a    1
n    1
m    1
g    1
Name: count, dtype: int64

pd.value_counts(obj.to_numpy(), sort = False)

c    2
d    1
a    1
b    2
n    1
m    1
g    1
k    2
Name: count, dtype: int64

# 'isin' is used for vectorized set memebership

obj

0     c
1     d
2     a
3     b
4     n
5     m
6     g
7     k
8     b
9     c
10    k
dtype: object

mask = obj.isin(['b', 'c'])

mask

0      True
1     False
2     False
3      True
4     False
5     False
6     False
7     False
8      True
9      True
10    False
dtype: bool

obj[mask]

0    c
3    b
8    b
9    c
dtype: object

to_match = pd.Series(['c', 'a', 'b', 'b', 'c'])

unique_vals = pd.Series(['c', 'b', 'a'])

indices = pd.Index(unique_vals).get_indexer(to_match)

indices

array([0, 2, 1, 1, 0], dtype=int64)

data = pd.DataFrame({'ça va': [1, 2, 3, 4, 5],
                    'oui, ça va' : [43,3, 2, 4, 2], 
                    'et toi': [3, 2, 44, 1, 5]})

data

	ça va	oui, ça va	et toi
0	1	43	3
1	2	3	2
2	3	2	44
3	4	4	1
4	5	2	5

# computing value counts
data['ça va'].value_counts().sort_index()

ça va
1    1
2    1
3    1
4    1
5    1
Name: count, dtype: int64

data['et toi'].value_counts().sort_index()

et toi
1     1
2     1
3     1
5     1
44    1
Name: count, dtype: int64

result = data.apply(pd.value_counts).fillna(0)

result

	ça va	oui, ça va	et toi
1	1.0	0.0	1.0
2	1.0	2.0	1.0
3	1.0	1.0	1.0
4	1.0	1.0	0.0
5	1.0	0.0	1.0
43	0.0	1.0	0.0
44	0.0	0.0	1.0