import pandas as pd
import numpy as np
import patsy
import statsmodels
Interfacing between pandas and model code
Creating model descriptions with patsy
- Data Transformations in Patsy Formulas
- Categorical data and patsy
Introduction to statsmodels
- Estimating linear models
- Estimating time series processes
Introduction to scikit-learn
Interfacing (data loading and cleaning before model building)
data = pd.DataFrame({
    "x": [1, 2, 3, 4, 5],
    "y": [0.1, 0.2, 0.4, 0.6, 0.7],
    "z": [-1, -3, -0.45, -5.6, 4]
})
data
data.to_numpy()
# working with DataFrames
df2 = pd.DataFrame(data.to_numpy(), columns=['one', 'two', 'three'])
df2
df3 = data.copy()
df3['strings'] = ['a', 'b', 'c', 'd', 'e']
df3
df3.to_numpy()
# to use a subset of columns, combine loc indexing with to_numpy
= ["x", "y"]
model_cols
data.loc[:, model_cols].to_numpy()
data['category'] = pd.Categorical(['a', 'b', 'a', 'a', 'b'],
                                  categories=['a', 'b'])
data
# replacing category with dummy variables
dummies = pd.get_dummies(data.category, prefix='category')
data_with_dummies = data.drop('category', axis=1).join(dummies)
data_with_dummies
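When the dummy columns feed a model that already includes an intercept, one level is typically dropped to avoid perfect collinearity. A minimal sketch using pandas' drop_first option (not used in the original notes):

# drop the first level; the remaining columns are relative to 'a'
pd.get_dummies(data.category, prefix='category', drop_first=True)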
Creating model descriptions with Patsy
data
data = data.drop('category', axis=1)
a, b = patsy.dmatrices('z ~ x + y', data)
a
DesignMatrix with shape (5, 1)
z
-1.00
-3.00
-0.45
-5.60
4.00
Terms:
'z' (column 0)
b
DesignMatrix with shape (5, 3)
Intercept x y
1 1 0.1
1 2 0.2
1 3 0.4
1 4 0.6
1 5 0.7
Terms:
'Intercept' (column 0)
'x' (column 1)
'y' (column 2)
np.asarray(a)
np.asarray(b)
# adding + 0 to suppress the intercept
patsy.dmatrices('z ~ x + y + 0', data)[1]
patsy.dmatrices('z ~ x + y + 0', data)
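Patsy also accepts - 1 to remove the intercept, mirroring R's formula syntax; a quick sketch on the same data:

# '- 1' is equivalent to '+ 0' for suppressing the intercept
patsy.dmatrices('z ~ x + y - 1', data)[1]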
Patsy objects can be passed directly into model-fitting algorithms, e.g. numpy.linalg.lstsq:
# b is the design matrix and a the response; passing rcond=None avoids the
# FutureWarning about the changing default cutoff
coef, resid, _, _ = np.linalg.lstsq(b, a, rcond=None)
coef
coef = pd.Series(coef.squeeze(), index=b.design_info.column_names)
Data Transformations in Patsy Formulas
a, b = patsy.dmatrices('x ~ y + np.log(np.abs(x) + 1)', data)
b
DesignMatrix with shape (5, 3)
Intercept y np.log(np.abs(x) + 1)
1 0.1 0.69315
1 0.2 1.09861
1 0.4 1.38629
1 0.6 1.60944
1 0.7 1.79176
Terms:
'Intercept' (column 0)
'y' (column 1)
'np.log(np.abs(x) + 1)' (column 2)
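Because + in a patsy formula means set union of terms rather than addition, arithmetic between columns has to be wrapped in patsy's I() function. A minimal sketch on the same data:

# I(...) protects the expression, so x and y are summed numerically
patsy.dmatrices('z ~ I(x + y)', data)[1]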
= patsy.dmatrices("y ~ standardize(x) + center(z)", data) a,b
a
DesignMatrix with shape (5, 1)
y
0.1
0.2
0.4
0.6
0.7
Terms:
'y' (column 0)
b
DesignMatrix with shape (5, 3)
Intercept standardize(x) center(z)
1 -1.41421 0.21
1 -0.70711 -1.79
1 0.00000 0.76
1 0.70711 -4.39
1 1.41421 5.21
Terms:
'Intercept' (column 0)
'standardize(x)' (column 1)
'center(z)' (column 2)
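standardize and center are stateful transformations: when a fitted model is applied to new data, the mean and standard deviation of the original (training) data should be reused. patsy.build_design_matrices does exactly that through the saved design_info; a sketch with hypothetical new_data:

new_data = pd.DataFrame({
    "x": [6, 7, 8, 9],
    "y": [1, 2, 3, 4],
    "z": [5, 5, 5, 5]
})
# rebuild the design matrix for new_data using the training-set statistics
new_X = patsy.build_design_matrices([b.design_info], new_data)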
Categorical Data and Patsy
data2 = pd.DataFrame({
    'key1': ['a', 'b', 'c', 'a', 'c', 'b'],
    'key2': [0, 1, 0, 1, 0, 1],
    'v1': [1, 2, 3, 4, 5, 6],
    'v2': [-1, 0, -1.5, 4.0, 2.5, -1.7]
})
y, X = patsy.dmatrices('v2 ~ key1', data2)
X
DesignMatrix with shape (6, 3)
Intercept key1[T.b] key1[T.c]
1 0 0
1 1 0
1 0 1
1 0 0
1 0 1
1 1 0
Terms:
'Intercept' (column 0)
'key1' (columns 1:3)
y, X = patsy.dmatrices('v2 ~ C(key2)', data2)
X
DesignMatrix with shape (6, 2)
Intercept C(key2)[T.1]
1 0
1 1
1 0
1 1
1 0
1 1
Terms:
'Intercept' (column 0)
'C(key2)' (column 1)
data2['key2'] = data2['key2'].map({0: 'zero', 1: 'one'})
data2
|   | key1 | key2 | v1 | v2 |
|---|------|------|----|-----|
| 0 | a | zero | 1 | -1.0 |
| 1 | b | one | 2 | 0.0 |
| 2 | c | zero | 3 | -1.5 |
| 3 | a | one | 4 | 4.0 |
| 4 | c | zero | 5 | 2.5 |
| 5 | b | one | 6 | -1.7 |
y, X = patsy.dmatrices('v2 ~ key1 + key2', data2)
X
DesignMatrix with shape (6, 4)
Intercept key1[T.b] key1[T.c] key2[T.zero]
1 0 0 1
1 1 0 0
1 0 1 1
1 0 0 0
1 0 1 1
1 1 0 0
Terms:
'Intercept' (column 0)
'key1' (columns 1:3)
'key2' (column 3)
y, X = patsy.dmatrices('v2 ~ key1 + key2 + key1:key2', data2)
X
DesignMatrix with shape (6, 6)
Columns:
['Intercept',
'key1[T.b]',
'key1[T.c]',
'key2[T.zero]',
'key1[T.b]:key2[T.zero]',
'key1[T.c]:key2[T.zero]']
Terms:
'Intercept' (column 0)
'key1' (columns 1:3)
'key2' (column 3)
'key1:key2' (columns 4:6)
(to view full data, use np.asarray(this_obj))
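By default patsy drops the first level alphabetically as the baseline. The reference level can be changed with the Treatment contrast inside C(); a sketch making 'c' the omitted baseline (not run in the original notes):

# encode key1 with 'c' as the reference level
patsy.dmatrices("v2 ~ C(key1, Treatment(reference='c'))", data2)[1]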
Introduction to statsmodels
- linear models, generalized linear models, and robust linear models (a robust-regression sketch follows this list)
- linear mixed effects models
- ANOVA
- time series and state space models
- generalized method of moments
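For instance, the robust linear models mentioned above downweight outliers rather than minimizing squared error directly. A minimal, self-contained sketch with made-up data (sm.RLM defaults to a Huber norm):

import numpy as np
import statsmodels.api as sm

rng_demo = np.random.default_rng(0)
x_demo = np.arange(20, dtype=float)
y_demo = 0.5 * x_demo + rng_demo.normal(size=20)
y_demo[5] = 20.0  # inject one large outlier
X_demo = sm.add_constant(x_demo)
# fit by iteratively reweighted least squares with a robust loss
sm.RLM(y_demo, X_demo).fit().params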
Estimating linear models
import statsmodels.api as sm
import statsmodels.formula.api as smf
# generating a linear model from random data
rng = np.random.default_rng(seed=12345)

def dnorm(mean, variance, size=1):
    # draw samples from a normal with the given mean and variance
    if isinstance(size, int):
        size = size,
    return mean + np.sqrt(variance) * rng.standard_normal(*size)
N = 100
X = np.c_[dnorm(0, 0.4, size=N),
          dnorm(0, 0.6, size=N),
          dnorm(0, 0.2, size=N)]
eps = dnorm(0, 0.1, size=N)
beta = [0.1, 0.3, 0.5]
y = np.dot(X, beta) + eps
X[:5]
array([[-0.90050602, -0.18942958, -1.0278702 ],
[ 0.79925205, -1.54598388, -0.32739708],
[-0.55065483, -0.12025429, 0.32935899],
[-0.16391555, 0.82403985, 0.20827485],
[-0.04765129, -0.21314698, -0.04824364]])
y[:5]
array([-0.59952668, -0.58845445, 0.18563386, -0.00747657, -0.01537445])
# adding a constant column to fit a linear model with an intercept term
X_model = sm.add_constant(X)
X_model[:5]
array([[ 1. , -0.90050602, -0.18942958, -1.0278702 ],
[ 1. , 0.79925205, -1.54598388, -0.32739708],
[ 1. , -0.55065483, -0.12025429, 0.32935899],
[ 1. , -0.16391555, 0.82403985, 0.20827485],
[ 1. , -0.04765129, -0.21314698, -0.04824364]])
# fitting a least squares linear regression with sm.OLS
model = sm.OLS(y, X)
results = model.fit()
results.params
array([0.06681503, 0.26803235, 0.45052319])
# printing summary
print(results.summary())
OLS Regression Results
=======================================================================================
Dep. Variable: y R-squared (uncentered): 0.469
Model: OLS Adj. R-squared (uncentered): 0.452
Method: Least Squares F-statistic: 28.51
Date: Fri, 01 Mar 2024 Prob (F-statistic): 2.66e-13
Time: 12:55:52 Log-Likelihood: -25.611
No. Observations: 100 AIC: 57.22
Df Residuals: 97 BIC: 65.04
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
x1 0.0668 0.054 1.243 0.217 -0.040 0.174
x2 0.2680 0.042 6.313 0.000 0.184 0.352
x3 0.4505 0.068 6.605 0.000 0.315 0.586
==============================================================================
Omnibus: 0.435 Durbin-Watson: 1.869
Prob(Omnibus): 0.805 Jarque-Bera (JB): 0.301
Skew: 0.134 Prob(JB): 0.860
Kurtosis: 2.995 Cond. No. 1.64
==============================================================================
Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
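The summary reports an uncentered R² because the model above was fit without a constant. The X_model array built earlier with sm.add_constant can be used to refit with an intercept; a quick sketch:

# refit including the intercept column added by sm.add_constant
sm.OLS(y, X_model).fit().params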
data = pd.DataFrame(X, columns=['col0', 'col1', 'col2'])
data['y'] = y
data[:5]
|   | col0 | col1 | col2 | y |
|---|------|------|------|---|
| 0 | -0.900506 | -0.189430 | -1.027870 | -0.599527 |
| 1 | 0.799252 | -1.545984 | -0.327397 | -0.588454 |
| 2 | -0.550655 | -0.120254 | 0.329359 | 0.185634 |
| 3 | -0.163916 | 0.824040 | 0.208275 | -0.007477 |
| 4 | -0.047651 | -0.213147 | -0.048244 | -0.015374 |
# using the statsmodels formula API and patsy formulas
results = smf.ols('y ~ col0 + col1 + col2', data=data).fit()
results.params
Intercept -0.020799
col0 0.065813
col1 0.268970
col2 0.449419
dtype: float64
results.tvalues
Intercept -0.652501
col0 1.219768
col1 6.312369
col2 6.567428
dtype: float64
# computing predicted values
results.predict(data[:5])
0 -0.592959
1 -0.531160
2 0.058636
3 0.283658
4 -0.102947
dtype: float64
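Because the formula API stores the patsy design information, results.predict builds the design matrix for unseen data automatically, applying any formula transformations. A sketch with hypothetical new rows:

new_rows = pd.DataFrame({'col0': [0.1, -0.2],
                         'col1': [0.0, 0.5],
                         'col2': [1.0, -1.0]})
results.predict(new_rows)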
Estimating time series processes
init_x = 4
values = [init_x, init_x]
N = 1000
b0 = 0.8
b1 = -0.4
noise = dnorm(0, 0.1, N)
# simulate an AR(2) process: x_t = b0 * x_{t-1} + b1 * x_{t-2} + noise
for i in range(N):
    new_x = values[-1] * b0 + values[-2] * b1 + noise[i]
    values.append(new_x)
from statsmodels.tsa.ar_model import AutoReg
MAXLAGS = 5
model = AutoReg(values, MAXLAGS)
results = model.fit()
results.params
array([0. , 0.81652213, 0.5528124 , 0.37427221, 0.25339463,
0.17155651])
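Rather than fixing MAXLAGS by hand, the lag order can be chosen by information criterion with statsmodels' ar_select_order; a sketch using AIC:

from statsmodels.tsa.ar_model import ar_select_order

# search lag orders up to 10 and keep the best-scoring specification
selection = ar_select_order(values, maxlag=10, ic='aic')
selection.ar_lags          # lags chosen for the AR model
selection.model.fit().params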
Scikit-learn
! pip install scikit-learn
= pd.read_csv(r"E:\pythonfordatanalysis\semainedu26fevrier\train (1).csv") train
= pd.read_csv(r"E:\pythonfordatanalysis\semainedu26fevrier\test (1).csv") test
train.head()
|   | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# looking for missing data
train.isna().sum()
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
test.isna().sum()
PassengerId 0
Pclass 0
Name 0
Sex 0
Age 86
SibSp 0
Parch 0
Ticket 0
Fare 1
Cabin 327
Embarked 0
dtype: int64
# using 'Age' as a predictor
# using median of training set to fill missing values
impute_value = train['Age'].median()
train['Age'] = train['Age'].fillna(impute_value)
test['Age'] = test['Age'].fillna(impute_value)
# specifying model features: encoding 'Sex' as a numeric column
train['isFemale'] = (train['Sex'] == 'female').astype(int)
test['isFemale'] = (test['Sex'] == 'female').astype(int)
# creating NumPy arrays and deciding on some model variables
predictors = ['Pclass', 'isFemale', 'Age']
X_train = train[predictors].to_numpy()
X_test = test[predictors].to_numpy()
y_train = train['Survived'].to_numpy()
X_train[:5]
array([[ 3., 0., 22.],
[ 1., 1., 38.],
[ 3., 1., 26.],
[ 1., 1., 35.],
[ 3., 0., 35.]])
y_train[:5]
array([0, 1, 1, 1, 0], dtype=int64)
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)
LogisticRegression()
y_predict = model.predict(X_test)
y_predict[:10]
array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0], dtype=int64)
# if the true test labels y_true were available, accuracy would be
(y_true == y_predict).mean()
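The test labels are not part of this dataset, so as a stand-in the same computation can be run on the training data (in-sample accuracy is optimistic):

# in-sample accuracy; expect it to overstate test performance
(model.predict(X_train) == y_train).mean()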
Theory
- many models have hyperparameters that can be tuned to avoid overfitting
- example: cross-validation, which evaluates the model on held-out folds (takes longer to train, but often generalizes better)
from sklearn.linear_model import LogisticRegressionCV
model_cv = LogisticRegressionCV(Cs=10)
model_cv.fit(X_train, y_train)
LogisticRegressionCV()
from sklearn.model_selection import cross_val_score
model = LogisticRegression(C=10)
scores = cross_val_score(model, X_train, y_train, cv=4)
scores
array([0.77578475, 0.79820628, 0.77578475, 0.78828829])
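A usage note: the four fold scores are usually averaged into a single cross-validated accuracy estimate:

# mean accuracy across the 4 folds
scores.mean()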