import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
# Time Series
Date and Time Data types and tools
- Converting between String and Datetime
Time series basics
- Indexing, selection, subsetting
- Time series with duplicate indices
Date Ranges, Frequencies, and Shifting
- Generating Date Ranges
- Frequencies and Date offsets
- Shifting (Leading and Lagging) Data
- Shifting dates with offsets
Time Zone Handling
- Time Zone Localization and Conversion
- Operations with Time Zone-Aware Timestamp Objects
- Operations between different time zones
Periods and Period Arithmetic
- period frequency conversion
- Quarterly period frequencies
- Converting timestamps to periods (and back)
- Creating a PeriodIndex from Arrays
Resampling and Frequency Conversion
- Downsampling
- Open-high-low-close(OHLC) resampling
- Upsampling and Interpolation
- Resampling with periods
- Grouped time resampling
Moving window functions
- Exponentially weighted functions
- Binary moving window functions
- User-defined window functions
## Date and Time Data Types and Tools
now = datetime.now()
now
now.year, now.month, now.day
delta = datetime(2011, 1, 7) - datetime(2008, 6, 3, 4, 1, 4)
delta
delta.days
delta.seconds
# add or subtract a timedelta to yield a new shifted object
from datetime import timedelta
start = datetime(2024, 2, 28)
start + timedelta(12)
start - 2 * timedelta(12)
help(datetime)
### Converting between String and Datetime
stamp = datetime(2024, 1, 30)
stamp
str(stamp)
stamp.strftime("%Y-%m-%d")
value = '2024-2-28'
datetime.strptime(value, '%Y-%m-%d')
= ["2/23/2024", "2/28/2024"]
datestrs
"%m/%d/%Y") for x in datestrs] [datetime.strptime(x,
# using pandas
= ["2024-2-23 12:00:00", "2024-2-28 00:00:00"] datestrs
pd.to_datetime(datestrs)
= pd.to_datetime(datestrs+ [None]) idx
idx
idx[2]
pd.isna(idx)
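`pd.to_datetime` also accepts an explicit format and a policy for unparseable strings. A minimal sketch (the format string and the bad entry are illustrative, not from the notebook above):

```python
import pandas as pd

# errors="coerce" turns unparseable strings into NaT instead of raising
mixed = ["2024-02-23", "2024-02-28", "not a date"]
pd.to_datetime(mixed, format="%Y-%m-%d", errors="coerce")
```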
## Time Series Basics
dates = [datetime(2024, 2, 23), datetime(2024, 2, 24),
         datetime(2024, 2, 25), datetime(2024, 2, 26),
         datetime(2024, 2, 27), datetime(2024, 2, 28)]
ts = pd.Series(np.random.standard_normal(6), index=dates)
ts
ts.index
ts + ts[::2]
ts.index.dtype
stamp = ts.index[0]
stamp
### Indexing, Selection, Subsetting
stamp = ts.index[2]
ts[stamp]
stamp
"2024-02-25"] ts[
longer_ts = pd.Series(np.random.standard_normal(1000),
                      index=pd.date_range("2000-01-01", periods=1000))
longer_ts
"2001"] longer_ts[
"2001-05"] longer_ts[
# slicing with datetime objects works as well
ts[datetime(2024, 2, 25):]
ts.truncate(after="2024-02-26")
# the same selection works for DataFrames
dates_df = pd.date_range("2024-01-01", periods=100, freq="W-WED")
long_df = pd.DataFrame(np.random.standard_normal((100, 4)),
                       index=dates_df,
                       columns=['Colorado', 'Texas', 'New York', 'Ohio'])
long_df.loc["2024-05"]
### Time Series with Duplicate Indices
# note the repeated "2018-11-06" entry
dates = pd.DatetimeIndex(["2017-12-03", "2018-11-06", "2018-11-06",
                          "2021-01-28", "2022-09-06"])
dup_ts = pd.Series(np.arange(5), index=dates)
dup_ts
# check whether the index is unique
dup_ts.index.is_unique
# entries with a non-unique index can be aggregated by grouping on level 0
grouped = dup_ts.groupby(level=0)
grouped.mean()
grouped.count()
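Indexing behaves differently depending on whether the selected timestamp is repeated. A self-contained check, assuming the duplicated "2018-11-06" entry introduced above:

```python
import numpy as np
import pandas as pd

dates = pd.DatetimeIndex(["2017-12-03", "2018-11-06", "2018-11-06",
                          "2021-01-28", "2022-09-06"])
dup_ts = pd.Series(np.arange(5), index=dates)
print(dup_ts["2021-01-28"])   # scalar: this timestamp is unique
print(dup_ts["2018-11-06"])   # Series: this timestamp is duplicated
```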
## Date Ranges, Frequencies, and Shifting
ts
resampler = ts.resample("D")
resampler
### Generating Date Ranges
= pd.date_range("2012-04-01", "2012-06-01")
index index
="2024-04-01", periods=20) pd.date_range(start
="2024-03-24", periods=20) pd.date_range(end
"2000-01-01", '2000-12-01', freq="BM") pd.date_range(
### Frequencies and Date Offsets
from pandas.tseries.offsets import Hour, Minute
hour = Hour()
hour
four_hours = Hour(4)
four_hours
# offsets can be combined with addition
Hour(2) + Minute(30)
pd.date_range("2023-05-01", periods=10, freq='1h30min')
# week-of-month dates: the third Friday of each month
monthly_dates = pd.date_range("2012-01-01", "2012-09-01", freq="WOM-3FRI")
list(monthly_dates)
### Shifting (Leading and Lagging) Data
- moving data backward and forward through time
ts = pd.Series(np.random.standard_normal(4),
               index=pd.date_range("2022-01-01", periods=4, freq="M"))
ts
ts.shift(2)
ts.shift(-2)
# percent change from one period to the next
ts / ts.shift(1) - 1
ts.shift(2, freq="M")
### Shifting Dates with Offsets
from pandas.tseries.offsets import MonthEnd
ts = pd.Series(np.random.standard_normal(20),
               index=pd.date_range("2024-10-02", periods=20, freq="4D"))
ts
# group by the month-end date each observation rolls forward to
ts.groupby(MonthEnd().rollforward).mean()
# an easier equivalent is to resample
ts.resample("M").mean()
2024-10-31 0.076767
2024-11-30 -0.177065
2024-12-31 0.635081
Freq: M, dtype: float64
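The `rollforward` method used above advances a date to the offset's next anchor point, and `rollback` does the reverse. A minimal illustration on a single datetime:

```python
from datetime import datetime
from pandas.tseries.offsets import MonthEnd

now = datetime(2024, 10, 11)
offset = MonthEnd()
print(offset.rollforward(now))  # 2024-10-31, the next month end
print(offset.rollback(now))     # 2024-09-30, the previous month end
print(now + MonthEnd(2))        # anchored offsets roll first: 2024-11-30
```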
## Time Zone Handling
import pytz
pytz.common_timezones[-5:]
['US/Eastern', 'US/Hawaii', 'US/Mountain', 'US/Pacific', 'UTC']
# to get a time zone object, use pytz.timezone
tz = pytz.timezone("America/New_York")
tz
<DstTzInfo 'America/New_York' LMT-1 day, 19:04:00 STD>
### Time Zone Localization and Conversion
= pd.date_range("2021-11-17 09:30", periods=6) dates
= pd.Series(np.random.standard_normal(len(dates)),
ts = dates) index
ts
2021-11-17 09:30:00 0.808643
2021-11-18 09:30:00 1.435830
2021-11-19 09:30:00 0.764818
2021-11-20 09:30:00 0.345263
2021-11-21 09:30:00 -0.671032
2021-11-22 09:30:00 0.694027
Freq: D, dtype: float64
print(ts.index.tz)
None
# generating a date range with a time zone set
pd.date_range("2023-3-01", periods=10, tz="UTC")
DatetimeIndex(['2023-03-01 00:00:00+00:00', '2023-03-02 00:00:00+00:00',
'2023-03-03 00:00:00+00:00', '2023-03-04 00:00:00+00:00',
'2023-03-05 00:00:00+00:00', '2023-03-06 00:00:00+00:00',
'2023-03-07 00:00:00+00:00', '2023-03-08 00:00:00+00:00',
'2023-03-09 00:00:00+00:00', '2023-03-10 00:00:00+00:00'],
dtype='datetime64[ns, UTC]', freq='D')
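Localization and conversion themselves are one method each: `tz_localize` attaches a zone to a naive index, and `tz_convert` translates an aware index into another zone. A sketch mirroring the series above:

```python
import numpy as np
import pandas as pd

ts = pd.Series(np.random.standard_normal(3),
               index=pd.date_range("2021-11-17 09:30", periods=3))
ts_utc = ts.tz_localize("UTC")                 # naive -> UTC-aware
print(ts_utc.index.tz)
print(ts_utc.tz_convert("America/New_York"))   # UTC -> New York time
```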
## Periods and Period Arithmetic
pd.Series(np.random.standard_normal(6))
0 0.434382
1 1.383303
2 0.975721
3 -1.377090
4 0.659404
5 1.098894
dtype: float64
= ["2001Q3", "20022Q2", '20033Q1'] values
= pd.period_range("2000-01-01", "2000-06-30",
periods ="M") freq
= pd.PeriodIndex(values, freq="Q-Dec") index
### Period Frequency Conversion
p = pd.Period("2011", freq="A-JUN")
p
Period('2011', 'A-JUN')
"M", how='start') p.asfreq(
Period('2010-07', 'M')
"M", how="end") p.asfreq(
Period('2011-06', 'M')
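Quarterly periods work the same way but are anchored to a fiscal year end (the TOC's "Quarterly period frequencies"). A sketch with a January year end; the dates in the comments follow pandas's Q-JAN convention:

```python
import pandas as pd

# Q-JAN: quarterly frequency for a fiscal year ending in January
p = pd.Period("2012Q4", freq="Q-JAN")
print(p)
print(p.asfreq("D", how="start"))  # 2011-11-01
print(p.asfreq("D", how="end"))    # 2012-01-31
```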
# a PeriodIndex can serve as a Series index
periods = pd.period_range("2006", "2009", freq="A-DEC")
ts = pd.Series(np.random.standard_normal(len(periods)), index=periods)
ts
2006 1.079865
2007 -1.891582
2008 -0.634198
2009 0.155782
Freq: A-DEC, dtype: float64
"M", how='start') ts.asfreq(
2006-01 1.079865
2007-01 -1.891582
2008-01 -0.634198
2009-01 0.155782
Freq: M, dtype: float64
### Converting Timestamps to Periods (and Back)
= pd.date_range("2000-01-01",
dates =3, freq="M") periods
= pd.Series(np.random.standard_normal(3),
ts=dates) index
ts
2000-01-31 -1.472182
2000-02-29 0.042816
2000-03-31 1.232869
Freq: M, dtype: float64
pts = ts.to_period()
pts
2000-01 -1.472182
2000-02 0.042816
2000-03 1.232869
Freq: M, dtype: float64
= pd.date_range("2022-01-29", periods=6) dates
= pd.Series(np.random.standard_normal(6),
ts2 = dates) index
ts2
2022-01-29 -0.865524
2022-01-30 1.518387
2022-01-31 0.327414
2022-02-01 0.380410
2022-02-02 -0.984295
2022-02-03 -2.798704
Freq: D, dtype: float64
pts = ts2.to_period()
pts
2022-01-29 -0.865524
2022-01-30 1.518387
2022-01-31 0.327414
2022-02-01 0.380410
2022-02-02 -0.984295
2022-02-03 -2.798704
Freq: D, dtype: float64
pts.to_timestamp(how='end')
2022-01-29 23:59:59.999999999 -0.865524
2022-01-30 23:59:59.999999999 1.518387
2022-01-31 23:59:59.999999999 0.327414
2022-02-01 23:59:59.999999999 0.380410
2022-02-02 23:59:59.999999999 -0.984295
2022-02-03 23:59:59.999999999 -2.798704
Freq: D, dtype: float64
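A PeriodIndex can also be assembled from separate arrays of year and quarter numbers (the TOC's "Creating a PeriodIndex from Arrays"), which is handy when a dataset stores them as columns. A minimal sketch; the arrays are illustrative, and recent pandas versions expose the same thing as `PeriodIndex.from_fields`:

```python
import pandas as pd

year = [2022, 2022, 2023, 2023]
quarter = [3, 4, 1, 2]
# combine the two field arrays into one quarterly PeriodIndex
idx = pd.PeriodIndex(year=year, quarter=quarter, freq="Q-DEC")
print(idx)
```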
## Resampling and Frequency Conversion
= pd.date_range("2022-12-10", periods=100) dates
= pd.Series(np.random.standard_normal(len(dates)),
ts=dates) index
ts
2022-12-10 0.655061
2022-12-11 -2.144779
2022-12-12 0.489381
2022-12-13 -0.117629
2022-12-14 0.782097
...
2023-03-15 -1.263925
2023-03-16 -0.317576
2023-03-17 0.398997
2023-03-18 -0.237727
2023-03-19 -1.407480
Freq: D, Length: 100, dtype: float64
"M").mean() ts.resample(
2022-12-31 -0.177024
2023-01-31 0.218224
2023-02-28 -0.040488
2023-03-31 -0.288735
Freq: M, dtype: float64
"M", kind="period").mean() ts.resample(
2022-12 -0.177024
2023-01 0.218224
2023-02 -0.040488
2023-03 -0.288735
Freq: M, dtype: float64
### Downsampling
= pd.date_range("2022-01-01",
dates =12, freq="T") periods
= pd.Series(np.arange(len(dates)), index=dates) ts
ts
2022-01-01 00:00:00 0
2022-01-01 00:01:00 1
2022-01-01 00:02:00 2
2022-01-01 00:03:00 3
2022-01-01 00:04:00 4
2022-01-01 00:05:00 5
2022-01-01 00:06:00 6
2022-01-01 00:07:00 7
2022-01-01 00:08:00 8
2022-01-01 00:09:00 9
2022-01-01 00:10:00 10
2022-01-01 00:11:00 11
Freq: T, dtype: int32
"5min").sum() ts.resample(
2022-01-01 00:00:00 10
2022-01-01 00:05:00 35
2022-01-01 00:10:00 21
Freq: 5T, dtype: int32
"5min", closed="right",
ts.resample(="right").sum() label
2022-01-01 00:00:00 0
2022-01-01 00:05:00 15
2022-01-01 00:10:00 40
2022-01-01 00:15:00 11
Freq: 5T, dtype: int32
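The bin edges themselves can also be shifted with the `offset` argument (available in pandas 1.1+), independent of `closed` and `label`. A sketch on the same minute data:

```python
import numpy as np
import pandas as pd

ts = pd.Series(np.arange(12),
               index=pd.date_range("2022-01-01", periods=12, freq="T"))
# offset="1min" moves every 5-minute bin edge forward by one minute
print(ts.resample("5min", offset="1min").sum())
```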
### Open-High-Low-Close (OHLC) Resampling
ts = pd.Series(np.random.permutation(np.arange(len(dates))), index=dates)
ts.resample("5min").ohlc()
| | open | high | low | close |
| --- | --- | --- | --- | --- |
| 2022-01-01 00:00:00 | 1 | 11 | 1 | 5 |
| 2022-01-01 00:05:00 | 3 | 7 | 0 | 0 |
| 2022-01-01 00:10:00 | 8 | 10 | 8 | 10 |
### Upsampling and Interpolation
frame = pd.DataFrame(np.random.standard_normal((2, 5)),
                     index=pd.date_range("2022-01-01", periods=2, freq="W-WED"),
                     columns=["fdk", 'sadik', 'golewala', 'pipli', 'mudki'])
frame
| | fdk | sadik | golewala | pipli | mudki |
| --- | --- | --- | --- | --- | --- |
| 2022-01-05 | -0.500335 | -0.286433 | 0.805294 | 0.557015 | -1.293101 |
| 2022-01-12 | 0.470091 | -0.574010 | -0.817633 | 0.197509 | 0.189306 |
# aside: a quick refresher on loops and list comprehensions
for n in range(10):
    print(n)
# list comprehension
[n**2 + 2 for n in range(5)]
[2, 3, 6, 11, 18]
# the equivalent for loop
for n in [n**2 + 2 for n in range(5)]:
    print(n)
2
3
6
11
18
df_daily = frame.resample('D').asfreq()
df_daily
| | fdk | sadik | golewala | pipli | mudki |
| --- | --- | --- | --- | --- | --- |
| 2022-01-05 | -0.500335 | -0.286433 | 0.805294 | 0.557015 | -1.293101 |
| 2022-01-06 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-07 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-08 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-09 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-10 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-11 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-12 | 0.470091 | -0.574010 | -0.817633 | 0.197509 | 0.189306 |
# forward-fill, but only a limited number of periods
frame.resample("D").ffill(limit=2)
| | fdk | sadik | golewala | pipli | mudki |
| --- | --- | --- | --- | --- | --- |
| 2022-01-05 | -0.500335 | -0.286433 | 0.805294 | 0.557015 | -1.293101 |
| 2022-01-06 | -0.500335 | -0.286433 | 0.805294 | 0.557015 | -1.293101 |
| 2022-01-07 | -0.500335 | -0.286433 | 0.805294 | 0.557015 | -1.293101 |
| 2022-01-08 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-09 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-10 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-11 | NaN | NaN | NaN | NaN | NaN |
| 2022-01-12 | 0.470091 | -0.574010 | -0.817633 | 0.197509 | 0.189306 |
"W-THU").ffill() frame.resample(
| | fdk | sadik | golewala | pipli | mudki |
| --- | --- | --- | --- | --- | --- |
| 2022-01-06 | -0.500335 | -0.286433 | 0.805294 | 0.557015 | -1.293101 |
| 2022-01-13 | 0.470091 | -0.574010 | -0.817633 | 0.197509 | 0.189306 |
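Besides `ffill`, the resampler can interpolate between the known points, which often suits upsampled numeric data better than repeating values. A sketch with deterministic numbers (the frame is illustrative):

```python
import numpy as np
import pandas as pd

frame = pd.DataFrame(np.arange(10.0).reshape(2, 5),
                     index=pd.date_range("2022-01-01", periods=2, freq="W-WED"))
# linear interpolation fills the daily gaps between the two weekly rows
print(frame.resample("D").interpolate())
```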
### Resampling with Periods
frame2 = pd.DataFrame(np.random.standard_normal((24, 4)),
                      index=pd.period_range("1-2000", "12-2001", freq="M"),
                      columns=['fzp', 'fdk', 'btd', 'mudki'])
frame2.head()
| | fzp | fdk | btd | mudki |
| --- | --- | --- | --- | --- |
| 2000-01 | -0.438864 | 0.122812 | -1.157890 | 0.403075 |
| 2000-02 | -0.688525 | -0.721772 | 0.011549 | -0.979600 |
| 2000-03 | 0.607322 | -1.119676 | 0.089442 | 0.060338 |
| 2000-04 | 0.690526 | -0.095600 | 0.212831 | 0.410823 |
| 2000-05 | -1.016129 | -0.430221 | -1.235741 | 0.250289 |
annual_frame = frame2.resample("A-DEC").mean()
annual_frame
| | fzp | fdk | btd | mudki |
| --- | --- | --- | --- | --- |
| 2000 | -0.094852 | -0.436623 | -0.600286 | -0.715234 |
| 2001 | 0.025365 | -0.347140 | 0.035530 | -0.521633 |
# Q-DEC: quarterly frequency with the year ending in December
annual_frame.resample("Q-DEC").ffill()
| | fzp | fdk | btd | mudki |
| --- | --- | --- | --- | --- |
| 2000Q1 | -0.094852 | -0.436623 | -0.600286 | -0.715234 |
| 2000Q2 | -0.094852 | -0.436623 | -0.600286 | -0.715234 |
| 2000Q3 | -0.094852 | -0.436623 | -0.600286 | -0.715234 |
| 2000Q4 | -0.094852 | -0.436623 | -0.600286 | -0.715234 |
| 2001Q1 | 0.025365 | -0.347140 | 0.035530 | -0.521633 |
| 2001Q2 | 0.025365 | -0.347140 | 0.035530 | -0.521633 |
| 2001Q3 | 0.025365 | -0.347140 | 0.035530 | -0.521633 |
| 2001Q4 | 0.025365 | -0.347140 | 0.035530 | -0.521633 |
"Q-Dec", convention="end").asfreq() annual_frame.resample(
| | fzp | fdk | btd | mudki |
| --- | --- | --- | --- | --- |
| 2000Q4 | -0.094852 | -0.436623 | -0.600286 | -0.715234 |
| 2001Q1 | NaN | NaN | NaN | NaN |
| 2001Q2 | NaN | NaN | NaN | NaN |
| 2001Q3 | NaN | NaN | NaN | NaN |
| 2001Q4 | 0.025365 | -0.347140 | 0.035530 | -0.521633 |
"Q-MAR").ffill() annual_frame.resample(
| | fzp | fdk | btd | mudki |
| --- | --- | --- | --- | --- |
| 2000Q4 | -0.094852 | -0.436623 | -0.600286 | -0.715234 |
| 2001Q1 | -0.094852 | -0.436623 | -0.600286 | -0.715234 |
| 2001Q2 | -0.094852 | -0.436623 | -0.600286 | -0.715234 |
| 2001Q3 | -0.094852 | -0.436623 | -0.600286 | -0.715234 |
| 2001Q4 | 0.025365 | -0.347140 | 0.035530 | -0.521633 |
| 2002Q1 | 0.025365 | -0.347140 | 0.035530 | -0.521633 |
| 2002Q2 | 0.025365 | -0.347140 | 0.035530 | -0.521633 |
| 2002Q3 | 0.025365 | -0.347140 | 0.035530 | -0.521633 |
### Grouped Time Resampling
N = 15
times = pd.date_range("2024-02-29 00:00", freq="1min", periods=N)
df = pd.DataFrame({'time': times, "value": np.arange(N)})
df
| | time | value |
| --- | --- | --- |
| 0 | 2024-02-29 00:00:00 | 0 |
| 1 | 2024-02-29 00:01:00 | 1 |
| 2 | 2024-02-29 00:02:00 | 2 |
| 3 | 2024-02-29 00:03:00 | 3 |
| 4 | 2024-02-29 00:04:00 | 4 |
| 5 | 2024-02-29 00:05:00 | 5 |
| 6 | 2024-02-29 00:06:00 | 6 |
| 7 | 2024-02-29 00:07:00 | 7 |
| 8 | 2024-02-29 00:08:00 | 8 |
| 9 | 2024-02-29 00:09:00 | 9 |
| 10 | 2024-02-29 00:10:00 | 10 |
| 11 | 2024-02-29 00:11:00 | 11 |
| 12 | 2024-02-29 00:12:00 | 12 |
| 13 | 2024-02-29 00:13:00 | 13 |
| 14 | 2024-02-29 00:14:00 | 14 |
# the same values at a 5-minute frequency
times2 = pd.date_range("2024-02-29 00:00", freq="5min", periods=N)
df2 = pd.DataFrame({'time': times2, "value": np.arange(N)})
df2
| | time | value |
| --- | --- | --- |
| 0 | 2024-02-29 00:00:00 | 0 |
| 1 | 2024-02-29 00:05:00 | 1 |
| 2 | 2024-02-29 00:10:00 | 2 |
| 3 | 2024-02-29 00:15:00 | 3 |
| 4 | 2024-02-29 00:20:00 | 4 |
| 5 | 2024-02-29 00:25:00 | 5 |
| 6 | 2024-02-29 00:30:00 | 6 |
| 7 | 2024-02-29 00:35:00 | 7 |
| 8 | 2024-02-29 00:40:00 | 8 |
| 9 | 2024-02-29 00:45:00 | 9 |
| 10 | 2024-02-29 00:50:00 | 10 |
| 11 | 2024-02-29 00:55:00 | 11 |
| 12 | 2024-02-29 01:00:00 | 12 |
| 13 | 2024-02-29 01:05:00 | 13 |
| 14 | 2024-02-29 01:10:00 | 14 |
# simple resampling on the time index
df.set_index('time').resample("5min").count()
| time | value |
| --- | --- |
| 2024-02-29 00:00:00 | 5 |
| 2024-02-29 00:05:00 | 5 |
| 2024-02-29 00:10:00 | 5 |
# a DataFrame with multiple time series, distinguished by the "key" column
df3 = pd.DataFrame({"time": times.repeat(3),
                    "key": np.tile(["a", "b", "c"], N),
                    'value': np.arange(N * 3.)})
df3.head()
| | time | key | value |
| --- | --- | --- | --- |
| 0 | 2024-02-29 00:00:00 | a | 0.0 |
| 1 | 2024-02-29 00:00:00 | b | 1.0 |
| 2 | 2024-02-29 00:00:00 | c | 2.0 |
| 3 | 2024-02-29 00:01:00 | a | 3.0 |
| 4 | 2024-02-29 00:01:00 | b | 4.0 |
# a pandas Grouper object lets us group by key and time window together
time_key = pd.Grouper(freq="5min")
resampled = (df3.set_index("time")
             .groupby(["key", time_key])
             .sum())
resampled
| key | time | value |
| --- | --- | --- |
| a | 2024-02-29 00:00:00 | 30.0 |
| | 2024-02-29 00:05:00 | 105.0 |
| | 2024-02-29 00:10:00 | 180.0 |
| b | 2024-02-29 00:00:00 | 35.0 |
| | 2024-02-29 00:05:00 | 110.0 |
| | 2024-02-29 00:10:00 | 185.0 |
| c | 2024-02-29 00:00:00 | 40.0 |
| | 2024-02-29 00:05:00 | 115.0 |
| | 2024-02-29 00:10:00 | 190.0 |
resampled.reset_index()
| | key | time | value |
| --- | --- | --- | --- |
| 0 | a | 2024-02-29 00:00:00 | 30.0 |
| 1 | a | 2024-02-29 00:05:00 | 105.0 |
| 2 | a | 2024-02-29 00:10:00 | 180.0 |
| 3 | b | 2024-02-29 00:00:00 | 35.0 |
| 4 | b | 2024-02-29 00:05:00 | 110.0 |
| 5 | b | 2024-02-29 00:10:00 | 185.0 |
| 6 | c | 2024-02-29 00:00:00 | 40.0 |
| 7 | c | 2024-02-29 00:05:00 | 115.0 |
| 8 | c | 2024-02-29 00:10:00 | 190.0 |
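`pd.Grouper` also accepts a `key=` argument naming a datetime column directly, which avoids the `set_index` step. A self-contained sketch of the same aggregation:

```python
import numpy as np
import pandas as pd

N = 15
times = pd.date_range("2024-02-29 00:00", freq="1min", periods=N)
df3 = pd.DataFrame({"time": times.repeat(3),
                    "key": np.tile(["a", "b", "c"], N),
                    "value": np.arange(N * 3.0)})
# key="time" tells the Grouper which column holds the timestamps
resampled = df3.groupby(["key", pd.Grouper(key="time", freq="5min")]).sum()
print(resampled.reset_index())
```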
## Moving Window Functions
- useful for noisy or gappy data
- automatically exclude missing data
data = pd.read_csv(r"E:\pythonfordatanalysis\semainedu26fevrier\iris.csv")  # raw string avoids backslash escapes
data.columns
Index(['Id', 'Sepal Length (cm)', 'Sepal Width (cm)', 'Petal Length (cm)',
'Petal Width (cm)', 'Species'],
dtype='object')
data['Sepal Width (cm)'].plot()
<Axes: >
"Sepal Width (cm)"].rolling(5).mean().plot() data[
<Axes: >
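An exponentially weighted mean (the TOC's "Exponentially weighted functions") puts more weight on recent observations than a plain rolling mean. A sketch assuming the same iris file loaded above:

```python
import pandas as pd

data = pd.read_csv(r"E:\pythonfordatanalysis\semainedu26fevrier\iris.csv")
sw = data["Sepal Width (cm)"]
sw.rolling(5).mean().plot(style="--")  # equally weighted window
sw.ewm(span=5).mean().plot()           # exponentially weighted; span ~ window size
```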