```python
import pandas as pd
import numpy as np
```

Data Wrangling: Join, Combine, and Reshape
Hierarchical Indexing
- Reordering and Sorting Levels
- Summary Statistics by Level
- Indexing with a DataFrame’s Columns

Combining and Merging Datasets
- Database-Style DataFrame Joins
- Merging on Index
- Concatenating Along an Axis
- Combining Data with Overlap

Reshaping and Pivoting
- Reshaping with Hierarchical Indexing
- Pivoting ‘long’ to ‘wide’ Format
- Pivoting ‘wide’ to ‘long’ Format
Hierarchical Indexing (Series)

```python
data = pd.Series(np.random.uniform(size=9),
                 index=[['a', 'a', 'b', 'c', 'c', 'b', 'c', 'b', 'a'],
                        [1, 2, 3, 1, 3, 4, 3, 2, 1]])
data
```

```
a  1    0.684862
   2    0.701188
b  3    0.870829
c  1    0.958994
   3    0.042434
b  4    0.539591
c  3    0.668997
b  2    0.501304
a  1    0.260682
dtype: float64
```
```python
# the "gaps" in the display above indicate a MultiIndex
data.index
```

```
MultiIndex([('a', 1),
            ('a', 2),
            ('b', 3),
            ('c', 1),
            ('c', 3),
            ('b', 4),
            ('c', 3),
            ('b', 2),
            ('a', 1)],
           )
```
```python
data
```

```
a  1    0.684862
   2    0.701188
b  3    0.870829
c  1    0.958994
   3    0.042434
b  4    0.539591
c  3    0.668997
b  2    0.501304
a  1    0.260682
dtype: float64
```
```python
# selecting a subset
data['b']
```

```
3    0.870829
4    0.539591
2    0.501304
dtype: float64
```
```python
# selecting data values with the loc operator
data.loc[['a', 'b']]
```

```
a  1    0.684862
   2    0.701188
   1    0.260682
b  3    0.870829
   4    0.539591
   2    0.501304
dtype: float64
```
```python
# selecting everything whose inner level equals 2
data.loc[:, 2]
```

```
a    0.701188
b    0.501304
dtype: float64
```
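A MultiIndex Series can also be rearranged into a DataFrame with unstack, which the Reshaping section covers in depth. Because this particular data contains duplicate (letter, number) pairs (for example, ('a', 1) appears twice), a sketch that aggregates the duplicates first:

```python
# unstack requires unique index entries, so average the duplicates
# first; the outer level becomes rows and the inner level columns
data.groupby(level=[0, 1]).mean().unstack()
```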
Hierarchical Indexing (DataFrame)

```python
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
                     index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                     columns=[['fdk', 'fzp', 'chd'],
                              ['PB', 'PB', 'CHD']])
frame
```

|   |   | fdk | fzp | chd |
|---|---|---|---|---|
|   |   | PB | PB | CHD |
| a | 1 | 0 | 1 | 2 |
|   | 2 | 3 | 4 | 5 |
| b | 1 | 6 | 7 | 8 |
|   | 2 | 9 | 10 | 11 |
```python
frame.index.names = ['key1', 'key2']
frame.columns.names = ['city', 'province']
frame
```

| city |   | fdk | fzp | chd |
|---|---|---|---|---|
| province |   | PB | PB | CHD |
| key1 | key2 |   |   |   |
| a | 1 | 0 | 1 | 2 |
|   | 2 | 3 | 4 | 5 |
| b | 1 | 6 | 7 | 8 |
|   | 2 | 9 | 10 | 11 |
```python
# to check how many levels an index has
frame.index.nlevels
```

```
2
```
```python
# partial column indexing
frame['fdk']
```

| province |   | PB |
|---|---|---|
| key1 | key2 |   |
| a | 1 | 0 |
|   | 2 | 3 |
| b | 1 | 6 |
|   | 2 | 9 |

```python
frame['fzp']
```

| province |   | PB |
|---|---|---|
| key1 | key2 |   |
| a | 1 | 1 |
|   | 2 | 4 |
| b | 1 | 7 |
|   | 2 | 10 |

```python
frame['chd']
```

| province |   | CHD |
|---|---|---|
| key1 | key2 |   |
| a | 1 | 2 |
|   | 2 | 5 |
| b | 1 | 8 |
|   | 2 | 11 |
```python
pd.MultiIndex.from_arrays([['fdk', 'fzp', 'chd'],
                           ['PB', 'PB', 'CHD']],
                          names=['city', 'capital'])
```
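The constructed MultiIndex can then be reused, for example as the columns of a new DataFrame (a minimal sketch; `cols` is just an illustrative name):

```python
# build the MultiIndex once and hand it to the DataFrame constructor
cols = pd.MultiIndex.from_arrays([['fdk', 'fzp', 'chd'],
                                  ['PB', 'PB', 'CHD']],
                                 names=['city', 'capital'])
pd.DataFrame(np.arange(12).reshape((4, 3)), columns=cols)
```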
Reordering and Sorting Levels

```python
frame.swaplevel('key1', 'key2')
```

| city |   | fdk | fzp | chd |
|---|---|---|---|---|
| province |   | PB | PB | CHD |
| key2 | key1 |   |   |   |
| 1 | a | 0 | 1 | 2 |
| 2 | a | 3 | 4 | 5 |
| 1 | b | 6 | 7 | 8 |
| 2 | b | 9 | 10 | 11 |
```python
frame.sort_index(level=1)
```

| city |   | fdk | fzp | chd |
|---|---|---|---|---|
| province |   | PB | PB | CHD |
| key1 | key2 |   |   |   |
| a | 1 | 0 | 1 | 2 |
| b | 1 | 6 | 7 | 8 |
| a | 2 | 3 | 4 | 5 |
| b | 2 | 9 | 10 | 11 |
```python
frame.swaplevel(0, 1).sort_index(level=0)
```

| city |   | fdk | fzp | chd |
|---|---|---|---|---|
| province |   | PB | PB | CHD |
| key2 | key1 |   |   |   |
| 1 | a | 0 | 1 | 2 |
|   | b | 6 | 7 | 8 |
| 2 | a | 3 | 4 | 5 |
|   | b | 9 | 10 | 11 |
Summary Statistics by Level

```python
frame.groupby(level='key2').sum()
```

| city | fdk | fzp | chd |
|---|---|---|---|
| province | PB | PB | CHD |
| key2 |   |   |   |
| 1 | 6 | 8 | 10 |
| 2 | 12 | 14 | 16 |
```python
frame.groupby(level='province', axis='columns').sum()
```

| province |   | CHD | PB |
|---|---|---|---|
| key1 | key2 |   |   |
| a | 1 | 2 | 1 |
|   | 2 | 5 | 7 |
| b | 1 | 8 | 13 |
|   | 2 | 11 | 19 |
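Note that groupby's axis='columns' argument is deprecated in recent pandas (2.1+). Assuming a newer version, the same column-wise sum can be written by transposing first (a sketch):

```python
# transpose so 'province' becomes an index level, group and sum,
# then transpose back; equivalent to groupby(..., axis='columns')
frame.T.groupby(level='province').sum().T
```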
Indexing with a DataFrame’s Columns

```python
frame2 = pd.DataFrame({'a': range(7), 'b': range(7, 0, -1),
                       'c': ['one', 'one', 'one', 'two', 'two',
                             'two', 'two'],
                       'd': [0, 1, 2, 0, 1, 3, 2]})
frame2
```

|   | a | b | c | d |
|---|---|---|---|---|
| 0 | 0 | 7 | one | 0 |
| 1 | 1 | 6 | one | 1 |
| 2 | 2 | 5 | one | 2 |
| 3 | 3 | 4 | two | 0 |
| 4 | 4 | 3 | two | 1 |
| 5 | 5 | 2 | two | 3 |
| 6 | 6 | 1 | two | 2 |
```python
# set_index creates a new DataFrame using one or more columns as the index
frame3 = frame2.set_index(['c', 'd'])
frame3
```

|   |   | a | b |
|---|---|---|---|
| c | d |   |   |
| one | 0 | 0 | 7 |
|   | 1 | 1 | 6 |
|   | 2 | 2 | 5 |
| two | 0 | 3 | 4 |
|   | 1 | 4 | 3 |
|   | 3 | 5 | 2 |
|   | 2 | 6 | 1 |
```python
# passing drop=False keeps the columns that were moved into the index
frame2.set_index(['c', 'd'], drop=False)
```

|   |   | a | b | c | d |
|---|---|---|---|---|---|
| c | d |   |   |   |   |
| one | 0 | 0 | 7 | one | 0 |
|   | 1 | 1 | 6 | one | 1 |
|   | 2 | 2 | 5 | one | 2 |
| two | 0 | 3 | 4 | two | 0 |
|   | 1 | 4 | 3 | two | 1 |
|   | 3 | 5 | 2 | two | 3 |
|   | 2 | 6 | 1 | two | 2 |
```python
# reset_index does the opposite of set_index: the index is moved back
# into a column (here frame2's default integer index becomes an
# 'index' column)
frame2.reset_index()
```

|   | index | a | b | c | d |
|---|---|---|---|---|---|
| 0 | 0 | 0 | 7 | one | 0 |
| 1 | 1 | 1 | 6 | one | 1 |
| 2 | 2 | 2 | 5 | one | 2 |
| 3 | 3 | 3 | 4 | two | 0 |
| 4 | 4 | 4 | 3 | two | 1 |
| 5 | 5 | 5 | 2 | two | 3 |
| 6 | 6 | 6 | 1 | two | 2 |
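Applied to frame3, reset_index demonstrates the actual round trip: the 'c' and 'd' index levels move back into ordinary columns (a sketch; only the column order differs from frame2):

```python
# move the hierarchical index levels of frame3 back into columns
frame3.reset_index()
```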
Combining and Merging Datasets
- pandas.merge: connects rows in DataFrames based on one or more keys (the how argument controls the join type)
- pandas.concat: stacks objects together along an axis
- combine_first: splices together overlapping data to fill in missing values
- merge function arguments
```python
# DataFrame joins
df1 = pd.DataFrame({'key': ['a', 'c', 'd', 'b', 'a', 'c'],
                    'data1': pd.Series(range(6), dtype='Int64')})
df2 = pd.DataFrame({'key': ['a', 'b', 'c'],
                    'data2': pd.Series(range(3), dtype='Int64')})
```
```python
df1
```

|   | key | data1 |
|---|---|---|
| 0 | a | 0 |
| 1 | c | 1 |
| 2 | d | 2 |
| 3 | b | 3 |
| 4 | a | 4 |
| 5 | c | 5 |

```python
df2
```

|   | key | data2 |
|---|---|---|
| 0 | a | 0 |
| 1 | b | 1 |
| 2 | c | 2 |
```python
pd.merge(df1, df2)
```

|   | key | data1 | data2 |
|---|---|---|---|
| 0 | a | 0 | 0 |
| 1 | a | 4 | 0 |
| 2 | c | 1 | 2 |
| 3 | c | 5 | 2 |
| 4 | b | 3 | 1 |
```python
# specifying the join column explicitly
pd.merge(df1, df2, on='key')
```

|   | key | data1 | data2 |
|---|---|---|---|
| 0 | a | 0 | 0 |
| 1 | a | 4 | 0 |
| 2 | c | 1 | 2 |
| 3 | c | 5 | 2 |
| 4 | b | 3 | 1 |
```python
pd.merge(df1, df2, how='outer')
```

|   | key | data1 | data2 |
|---|---|---|---|
| 0 | a | 0 | 0 |
| 1 | a | 4 | 0 |
| 2 | c | 1 | 2 |
| 3 | c | 5 | 2 |
| 4 | d | 2 | <NA> |
| 5 | b | 3 | 1 |
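Besides the default 'inner' and the 'outer' shown above, how also accepts 'left' and 'right'. A quick sketch of a left join, which keeps every row of df1:

```python
# left join: all keys from df1 are kept; 'd' has no match in df2,
# so its data2 entry becomes <NA>
pd.merge(df1, df2, how='left')
```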
Overlapping Column Names and suffixes

```python
# suffixes rename overlapping non-key columns; df1 and df2 have none,
# so the output matches the plain merge
pd.merge(df1, df2, on='key', suffixes=('_left', '_right'))
```

|   | key | data1 | data2 |
|---|---|---|---|
| 0 | a | 0 | 0 |
| 1 | a | 4 | 0 |
| 2 | c | 1 | 2 |
| 3 | c | 5 | 2 |
| 4 | b | 3 | 1 |
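The outline also lists merging on index: when the merge key sits in one frame's index rather than a column, pass right_index=True (or left_index=True). A sketch with a hypothetical frame `righty`:

```python
# 'righty' is a hypothetical frame keyed by its index
righty = pd.DataFrame({'group_val': [3.5, 7.0]}, index=['a', 'b'])
# match df1's 'key' column against righty's index
pd.merge(df1, righty, left_on='key', right_index=True, how='left')
```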
The join Instance Method

```python
# DataFrame.join merges on the other frame's index, so move 'key'
# into df2's index first (df1.join(df2, on='key') alone would raise
# an error because the 'key' column overlaps)
df1.join(df2.set_index('key'), on='key')
```
```python
another = pd.DataFrame([[7., 8.], [9., 10.],
                        [11., 12.], [16., 17.]],
                       index=['a', 'c', 'e', 'f'],
                       columns=['jandiala', 'faridkot'])
another
```

|   | jandiala | faridkot |
|---|---|---|
| a | 7.0 | 8.0 |
| c | 9.0 | 10.0 |
| e | 11.0 | 12.0 |
| f | 16.0 | 17.0 |
```python
# join on the index: how='outer' keeps the union of both indexes
df1.join(another, how='outer')
```

|   | key | data1 | jandiala | faridkot |
|---|---|---|---|---|
| 0 | a | 0 | NaN | NaN |
| 1 | c | 1 | NaN | NaN |
| 2 | d | 2 | NaN | NaN |
| 3 | b | 3 | NaN | NaN |
| 4 | a | 4 | NaN | NaN |
| 5 | c | 5 | NaN | NaN |
| a | NaN | <NA> | 7.0 | 8.0 |
| c | NaN | <NA> | 9.0 | 10.0 |
| e | NaN | <NA> | 11.0 | 12.0 |
| f | NaN | <NA> | 16.0 | 17.0 |
Concatenating Along an Axis
- data combination with pandas.concat
- pandas.concat function arguments
```python
arr = np.arange(12).reshape((3, 4))
arr
```

```
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])
```
```python
np.concatenate([arr, arr])
```

```
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])
```

```python
np.concatenate([arr, arr], axis=1)
```

```
array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])
```
```python
# Series with mostly non-overlapping indexes (s2 and s3 share label 'e')
s1 = pd.Series([0, 1], index=['a', 'b'], dtype='Int64')
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'], dtype='Int64')
s3 = pd.Series([5, 6], index=['e', 'f'], dtype='Int64')
```
```python
s1
```

```
a    0
b    1
dtype: Int64
```

```python
s2
```

```
c    2
d    3
e    4
dtype: Int64
```

```python
s3
```

```
e    5
f    6
dtype: Int64
```
```python
pd.concat([s1, s2, s3])
```

```
a    0
b    1
c    2
d    3
e    4
e    5
f    6
dtype: Int64
```
```python
# the result will be a DataFrame if we pass axis='columns'
pd.concat([s1, s2, s3], axis='columns')
```

|   | 0 | 1 | 2 |
|---|---|---|---|
| a | 0 | <NA> | <NA> |
| b | 1 | <NA> | <NA> |
| c | <NA> | 2 | <NA> |
| d | <NA> | 3 | <NA> |
| e | <NA> | 4 | 5 |
| f | <NA> | <NA> | 6 |
```python
# trying an inner join
s4 = pd.concat([s1, s3])
s4
```

```
a    0
b    1
e    5
f    6
dtype: Int64
```
```python
pd.concat([s1, s4], axis='columns')
```

|   | 0 | 1 |
|---|---|---|
| a | 0 | 0 |
| b | 1 | 1 |
| e | <NA> | 5 |
| f | <NA> | 6 |
```python
# because of the inner join, labels 'e' and 'f' disappeared
pd.concat([s1, s4], axis='columns', join='inner')
```

|   | 0 | 1 |
|---|---|---|
| a | 0 | 0 |
| b | 1 | 1 |
```python
result = pd.concat([s1, s1, s3], keys=['one', 'two', 'three'])
result
```

```
one    a    0
       b    1
two    a    0
       b    1
three  e    5
       f    6
dtype: Int64
```
```python
result.unstack()
```

|   | a | b | e | f |
|---|---|---|---|---|
| one | 0 | 1 | <NA> | <NA> |
| two | 0 | 1 | <NA> | <NA> |
| three | <NA> | <NA> | 5 | 6 |
- when combining Series along axis='columns', the keys become the DataFrame column headers
```python
pd.concat([s1, s2, s3], axis='columns',
          keys=['one', 'two', 'three'])
```

|   | one | two | three |
|---|---|---|---|
| a | 0 | <NA> | <NA> |
| b | 1 | <NA> | <NA> |
| c | <NA> | 2 | <NA> |
| d | <NA> | 3 | <NA> |
| e | <NA> | 4 | 5 |
| f | <NA> | <NA> | 6 |
```python
# the same logic extends to DataFrame objects
pd.concat([df1, df2], axis='columns')
```

|   | key | data1 | key | data2 |
|---|---|---|---|---|
| 0 | a | 0 | a | 0 |
| 1 | c | 1 | b | 1 |
| 2 | d | 2 | c | 2 |
| 3 | b | 3 | NaN | <NA> |
| 4 | a | 4 | NaN | <NA> |
| 5 | c | 5 | NaN | <NA> |
- if a dictionary of objects is passed, its keys are used for the keys option
```python
pd.concat({'level1': df1, 'level2': df2},
          axis='columns')
```

|   | level1 |   | level2 |   |
|---|---|---|---|---|
|   | key | data1 | key | data2 |
| 0 | a | 0 | a | 0 |
| 1 | c | 1 | b | 1 |
| 2 | d | 2 | c | 2 |
| 3 | b | 3 | NaN | <NA> |
| 4 | a | 4 | NaN | <NA> |
| 5 | c | 5 | NaN | <NA> |
```python
# the names argument labels the created column levels
pd.concat([df1, df2], axis='columns',
          keys=['level1', 'level2'],
          names=['upper', 'lower'])
```

| upper | level1 |   | level2 |   |
|---|---|---|---|---|
| lower | key | data1 | key | data2 |
| 0 | a | 0 | a | 0 |
| 1 | c | 1 | b | 1 |
| 2 | d | 2 | c | 2 |
| 3 | b | 3 | NaN | <NA> |
| 4 | a | 4 | NaN | <NA> |
| 5 | c | 5 | NaN | <NA> |
```python
# ignoring the original row index when concatenating DataFrames
df3 = pd.DataFrame(np.random.standard_normal((3, 4)),
                   columns=['a', 'b', 'c', 'd'])
df4 = pd.DataFrame(np.random.standard_normal((2, 3)),
                   columns=['g', 'd', 'a'])
```
```python
df3
```

|   | a | b | c | d |
|---|---|---|---|---|
| 0 | -0.692867 | -0.923164 | -1.055435 | 0.938207 |
| 1 | -0.060941 | 1.029882 | -0.332099 | -1.697114 |
| 2 | -0.274830 | 1.991366 | -0.540897 | 0.961377 |

```python
df4
```

|   | g | d | a |
|---|---|---|---|
| 0 | -1.397642 | 1.511266 | -0.920547 |
| 1 | 0.518125 | -1.409185 | -1.092790 |
Since df3 and df4 hold random draws, the values below come from a later re-run of those cells and differ from the displays above.

```python
pd.concat([df3, df4], ignore_index=True)
```

|   | a | b | c | d | g |
|---|---|---|---|---|---|
| 0 | 1.711731 | -0.644975 | -0.093205 | 0.074968 | NaN |
| 1 | -1.397718 | -1.585621 | 0.808180 | -0.492032 | NaN |
| 2 | 0.923910 | 0.606571 | -1.045814 | 1.247491 | NaN |
| 3 | -0.905022 | NaN | NaN | -1.122829 | -0.352158 |
| 4 | 0.091307 | NaN | NaN | -0.122968 | -0.349629 |
Combining Data with Overlap

```python
a = pd.Series([np.nan, 2.5, 0.0, 4.5, 3, np.nan],
              index=['a', 'b', 'c', 'g', 'k', 'o'])
b = pd.Series([0., np.nan, 3., np.nan, 5., 2.],
              index=['a', 'b', 'c', 'd', 'e', 'f'])
a
```

```
a    NaN
b    2.5
c    0.0
g    4.5
k    3.0
o    NaN
dtype: float64
```

```python
b
```

```
a    0.0
b    NaN
c    3.0
d    NaN
e    5.0
f    2.0
dtype: float64
```
Explanation:
- np.where(pd.isna(a), b, a) selects the value from b wherever a is null, and from a otherwise
- np.where pairs values by position and does not check the index labels
- it is better to use the combine_first method, which aligns on the index
- on DataFrames, combine_first returns the union of all column names
```python
np.where(pd.isna(a), b, a)
```

```
array([0. , 2.5, 0. , 4.5, 3. , 2. ])
```
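For contrast, Series.fillna also fills a's missing values from b but aligns by label and keeps only a's index, whereas combine_first (next) returns the union of the two indexes (an illustrative sketch):

```python
# label-aligned fill: a's NaN at 'a' is filled from b['a'],
# but b's extra labels ('d', 'e', 'f') are not added
a.fillna(b)
```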
```python
a.combine_first(b)
```

```
a    0.0
b    2.5
c    0.0
d    NaN
e    5.0
f    2.0
g    4.5
k    3.0
o    NaN
dtype: float64
```
```python
# using combine_first on DataFrames
df1.combine_first(df2)
```

|   | data1 | data2 | key |
|---|---|---|---|
| 0 | 0 | 0 | a |
| 1 | 1 | 1 | c |
| 2 | 2 | 2 | d |
| 3 | 3 | <NA> | b |
| 4 | 4 | <NA> | a |
| 5 | 5 | <NA> | c |
Reshaping and Pivoting
- stack method: rotates (pivots) the columns into the rows
- unstack method: pivots the rows into the columns
```python
data = pd.DataFrame(np.arange(6).reshape((2, 3)),
                    index=pd.Index(['fdk', 'golewala'],
                                   name='city'),
                    columns=pd.Index(['one', 'two', 'three'],
                                     name='number'))
data
```

| number | one | two | three |
|---|---|---|---|
| city |   |   |   |
| fdk | 0 | 1 | 2 |
| golewala | 3 | 4 | 5 |
```python
result_stack = data.stack()
result_stack
```

```
city      number
fdk       one       0
          two       1
          three     2
golewala  one       3
          two       4
          three     5
dtype: int32
```
```python
result_stack.unstack()
```

| number | one | two | three |
|---|---|---|---|
| city |   |   |   |
| fdk | 0 | 1 | 2 |
| golewala | 3 | 4 | 5 |
```python
result_stack.unstack(level=0)
```

| city | fdk | golewala |
|---|---|---|
| number |   |   |
| one | 0 | 3 |
| two | 1 | 4 |
| three | 2 | 5 |
```python
result_stack.unstack(level='city')
```

| city | fdk | golewala |
|---|---|---|
| number |   |   |
| one | 0 | 3 |
| two | 1 | 4 |
| three | 2 | 5 |
```python
# unstacking a DataFrame
df5 = pd.DataFrame({'left': result_stack, 'right': result_stack + 5},
                   columns=pd.Index(['left', 'right']))
df5
```

|   |   | left | right |
|---|---|---|---|
| city | number |   |   |
| fdk | one | 0 | 5 |
|   | two | 1 | 6 |
|   | three | 2 | 7 |
| golewala | one | 3 | 8 |
|   | two | 4 | 9 |
|   | three | 5 | 10 |
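Unstacking this DataFrame pivots the chosen index level into a new level of column labels (a minimal sketch):

```python
# move the 'city' index level into the columns, giving a
# two-level column index: (left, right) x (fdk, golewala)
df5.unstack(level='city')
```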
Pivoting ‘long’ to ‘wide’ Format

```python
data = pd.read_csv(r"E:\pythonfordatanalysis\machine-readable-business-employment-data-sep-2023-quarter.csv")
data.head()
```

|   | Series_reference | Period | Data_value | Suppressed | STATUS | UNITS | Magnitude | Subject | Group | Series_title_1 | Series_title_2 | Series_title_3 | Series_title_4 | Series_title_5 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BDCQ.SEA1AA | 2011.06 | 80078.0 | NaN | F | Number | 0 | Business Data Collection - BDC | Industry by employment variable | Filled jobs | Agriculture, Forestry and Fishing | Actual | NaN | NaN |
| 1 | BDCQ.SEA1AA | 2011.09 | 78324.0 | NaN | F | Number | 0 | Business Data Collection - BDC | Industry by employment variable | Filled jobs | Agriculture, Forestry and Fishing | Actual | NaN | NaN |
| 2 | BDCQ.SEA1AA | 2011.12 | 85850.0 | NaN | F | Number | 0 | Business Data Collection - BDC | Industry by employment variable | Filled jobs | Agriculture, Forestry and Fishing | Actual | NaN | NaN |
| 3 | BDCQ.SEA1AA | 2012.03 | 90743.0 | NaN | F | Number | 0 | Business Data Collection - BDC | Industry by employment variable | Filled jobs | Agriculture, Forestry and Fishing | Actual | NaN | NaN |
| 4 | BDCQ.SEA1AA | 2012.06 | 81780.0 | NaN | F | Number | 0 | Business Data Collection - BDC | Industry by employment variable | Filled jobs | Agriculture, Forestry and Fishing | Actual | NaN | NaN |
```python
data2 = data.loc[:, ['Period', 'Group', 'Magnitude']]
data2.head()
```

|   | Period | Group | Magnitude |
|---|---|---|---|
| 0 | 2011.06 | Industry by employment variable | 0 |
| 1 | 2011.09 | Industry by employment variable | 0 |
| 2 | 2011.12 | Industry by employment variable | 0 |
| 3 | 2012.03 | Industry by employment variable | 0 |
| 4 | 2012.06 | Industry by employment variable | 0 |
```python
help(pd.PeriodIndex)
```

```python
divide = pd.PeriodIndex(year=[2000, 2002],
                        quarter=[1, 4])
divide
```

```
PeriodIndex(['2000Q1', '2002Q4'], dtype='period[Q-DEC]')
```
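The same constructor could turn this dataset's Period column into a proper quarterly index; a sketch, assuming every value follows the 'year.quarter-end-month' pattern seen above (2011.06 is Q2 2011, 2011.12 is Q4 2011):

```python
# split '2011.06'-style floats into a year and a quarter-end month,
# then map months 3/6/9/12 to quarters 1-4
year = data['Period'].astype(int)
quarter = (data['Period'] % 1 * 100).round().astype(int) // 3
pd.PeriodIndex(year=year, quarter=quarter)
```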
```python
data.columns
```

```
Index(['Series_reference', 'Period', 'Data_value', 'Suppressed', 'STATUS',
       'UNITS', 'Magnitude', 'Subject', 'Group', 'Series_title_1',
       'Series_title_2', 'Series_title_3', 'Series_title_4', 'Series_title_5'],
      dtype='object')
```
```python
data.columns.name = 'item'
data.head()
```

Note that Period and Group no longer appear below, so they were evidently dropped from data in a step missing from this notebook.

| item | Series_reference | Data_value | Suppressed | STATUS | UNITS | Magnitude | Subject | Series_title_1 | Series_title_2 | Series_title_3 | Series_title_4 | Series_title_5 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BDCQ.SEA1AA | 80078.0 | NaN | F | Number | 0 | Business Data Collection - BDC | Filled jobs | Agriculture, Forestry and Fishing | Actual | NaN | NaN |
| 1 | BDCQ.SEA1AA | 78324.0 | NaN | F | Number | 0 | Business Data Collection - BDC | Filled jobs | Agriculture, Forestry and Fishing | Actual | NaN | NaN |
| 2 | BDCQ.SEA1AA | 85850.0 | NaN | F | Number | 0 | Business Data Collection - BDC | Filled jobs | Agriculture, Forestry and Fishing | Actual | NaN | NaN |
| 3 | BDCQ.SEA1AA | 90743.0 | NaN | F | Number | 0 | Business Data Collection - BDC | Filled jobs | Agriculture, Forestry and Fishing | Actual | NaN | NaN |
| 4 | BDCQ.SEA1AA | 81780.0 | NaN | F | Number | 0 | Business Data Collection - BDC | Filled jobs | Agriculture, Forestry and Fishing | Actual | NaN | NaN |
```python
long_data = (data.stack()
             .reset_index()
             .rename(columns={0: 'value'}))
long_data[:10]
```

|   | level_0 | item | value |
|---|---|---|---|
| 0 | 0 | Series_reference | BDCQ.SEA1AA |
| 1 | 0 | Data_value | 80078.0 |
| 2 | 0 | STATUS | F |
| 3 | 0 | UNITS | Number |
| 4 | 0 | Magnitude | 0 |
| 5 | 0 | Subject | Business Data Collection - BDC |
| 6 | 0 | Series_title_1 | Filled jobs |
| 7 | 0 | Series_title_2 | Agriculture, Forestry and Fishing |
| 8 | 0 | Series_title_3 | Actual |
| 9 | 1 | Series_reference | BDCQ.SEA1AA |
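The section title promises the return trip too: pivot turns this long format back into wide, one column per item. A sketch using the stacked row number level_0 as the row key:

```python
# long -> wide: level_0 identifies the original row, 'item' supplies
# the column headers, and 'value' fills in the cells
pivoted = long_data.pivot(index='level_0', columns='item', values='value')
pivoted.head()
```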
Pivoting ‘wide’ to ‘long’ Format
- pd.melt: merges multiple columns into one, using a particular column as the group indicator
- pivot (followed by reset_index): moves the data back into columns
```python
df6 = pd.DataFrame({'key': ['foo', 'bar', 'xyz'],
                    'A': [1, 3, 5],
                    'C': [4, 6, 3],
                    'D': [4, 64, 2]})
df6
```

|   | key | A | C | D |
|---|---|---|---|---|
| 0 | foo | 1 | 4 | 4 |
| 1 | bar | 3 | 6 | 64 |
| 2 | xyz | 5 | 3 | 2 |
```python
# using pd.melt with 'key' as the group indicator
melted = pd.melt(df6, id_vars='key')
melted
```

|   | key | variable | value |
|---|---|---|---|
| 0 | foo | A | 1 |
| 1 | bar | A | 3 |
| 2 | xyz | A | 5 |
| 3 | foo | C | 4 |
| 4 | bar | C | 6 |
| 5 | xyz | C | 3 |
| 6 | foo | D | 4 |
| 7 | bar | D | 64 |
| 8 | xyz | D | 2 |
```python
# back to the original shape with pivot
reshaped = melted.pivot(index='key',
                        columns='variable',
                        values='value')
reshaped
```

| variable | A | C | D |
|---|---|---|---|
| key |   |   |   |
| bar | 3 | 6 | 64 |
| foo | 1 | 4 | 4 |
| xyz | 5 | 3 | 2 |
```python
reshaped.reset_index()
```

| variable | key | A | C | D |
|---|---|---|---|---|
| 0 | bar | 3 | 6 | 64 |
| 1 | foo | 1 | 4 | 4 |
| 2 | xyz | 5 | 3 | 2 |
```python
df6
```

|   | key | A | C | D |
|---|---|---|---|---|
| 0 | foo | 1 | 4 | 4 |
| 1 | bar | 3 | 6 | 64 |
| 2 | xyz | 5 | 3 | 2 |
```python
# specify a subset of columns to use as value columns
pd.melt(df6, id_vars='key', value_vars=['A', 'C'])
```

|   | key | variable | value |
|---|---|---|---|
| 0 | foo | A | 1 |
| 1 | bar | A | 3 |
| 2 | xyz | A | 5 |
| 3 | foo | C | 4 |
| 4 | bar | C | 6 |
| 5 | xyz | C | 3 |
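melt also works without any group identifiers (a closing sketch):

```python
# with no id_vars, every listed column is unpivoted into
# variable/value pairs
pd.melt(df6, value_vars=['A', 'C', 'D'])
```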