Chapitre 2 : modèle linéaire généralisé

Size: px

Start display at page:

Download "Chapitre 2 : modèle linéaire généralisé"

Margery Wilkerson
5 years ago
Views:

1 Chapitre 2 : modèle linéaire généralisé

2 Introduction et jeux de données

3 Avant de commencer Faire pointer R vers votre répertoire setwd("~/dropbox/evry/m1geniomhe/cours/") source(file = "fonction_illustration_logistique.r") ## ## Attaching package: 'dplyr' ## The following objects are masked from 'package:stats': ## ## filter, lag ## The following objects are masked from 'package:base': ## ## intersect, setdiff, setequal, union (Installer et) charger les librairies ISLR, ggplot2,dplyr subsampling # sum(default == "Yes") # sample(which(default == "No"), size = 500, replace =FALSE) # Default_small = slice(default,c(which(default == "Yes"),sample(which(default = # write.table(default_small,file = 'Default_small')

4 les données Default ## [1] ## Observations: 833 ## Variables: 4 ## $ default <fctr> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes... ## $ student <fctr> Yes, Yes, Yes, No, Yes, Yes, No, No, No, No, No, No,... ## $ balance <dbl> , , , , ,... ## $ income <dbl> , , , , ,

5 scatterplot pairs(default, main="simple Scatterplot Matrix") Simple Scatterplot Matrix default student balance income

6 histogram ggplot(default, aes(x=default)) + geom_bar( fill = "steelblue") + theme_bw() count No default Yes

7 pour les variables balance et income ggplot(default, aes(balance, income)) + geom_point(aes(colour = default)) income default No Yes balance

8 ggplot(default, aes(default, balance)) + geom_boxplot(aes(fill = default)) 2000 balance default No Yes No default Yes

9 ggplot(default, aes(default, income)) + geom_boxplot(aes(fill = default)) income default No Yes No default Yes

10 ggplot(default, aes(balance, as.numeric(default)-1)) + geom_point() + geom_smooth(method = "glm", method.args = list(family = "gaussian")) 1.0 as.numeric(default) balance

11 plt = illustration_logistique(default,default$balance,default$default) plt probability of Default$default $(Default, balance)

12 plt = illustration_logistique(default,default$balance,default$default, log_reg = plt probability of Default$default $(Default, balance)

13 Les données crab crab = read.table("crab.txt",header = TRUE) glimpse(crab) ## Observations: 173 ## Variables: 6 ## $ Obs <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,... ## $ C <int> 2, 3, 3, 4, 2, 1, 4, 2, 2, 2, 1, 3, 2, 2, 3, 3, 2, 2, 2, 2... ## $ S <int> 3, 3, 3, 2, 3, 2, 3, 3, 1, 3, 1, 3, 1, 3, 3, 3, 3, 3, 3, 3... ## $ W <dbl> 28.3, 26.0, 25.6, 21.0, 29.0, 25.0, 26.2, 24.9, 25.7, ## $ Wt <dbl> 3.05, 2.60, 2.15, 1.85, 3.00, 2.30, 1.30, 2.10, 2.00, ## $ Sa <int> 8, 4, 0, 0, 1, 3, 0, 0, 8, 6, 5, 4, 3, 4, 3, 5, 8, 3, 6, 4... crab = mutate(crab, C = factor(c)) crab = mutate(crab, S = factor(s)) glimpse(crab) ## Observations: 173 ## Variables: 6 ## $ Obs <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,... ## $ C <fctr> 2, 3, 3, 4, 2, 1, 4, 2, 2, 2, 1, 3, 2, 2, 3, 3, 2, 2, 2,... ## $ S <fctr> 3, 3, 3, 2, 3, 2, 3, 3, 1, 3, 1, 3, 1, 3, 3, 3, 3, 3, 3,... ## $ W <dbl> 28.3, 26.0, 25.6, 21.0, 29.0, 25.0, 26.2, 24.9, 25.7, ## $ Wt <dbl> 3.05, 2.60, 2.15, 1.85, 3.00, 2.30, 1.30, 2.10, 2.00, ## $ Sa <int> 8, 4, 0, 0, 1, 3, 0, 0, 8, 6, 5, 4, 3, 4, 3, 5, 8, 3, 6, 4... attach(crab)

14 scatterplot pairs(crab[-1], main="simple Scatterplot Matrix") Simple Scatterplot Matrix C S W Wt Sa

15 histogram ggplot(crab, aes(x=sa)) + geom_bar( fill = "steelblue") + theme_bw() count Sa

16 pour la variable W plt = illustration_poisson(crab,w,sa) plt mean of Sa W

17 plt = illustration_poisson(crab,w,sa, log_pois = TRUE) plt mean of Sa W

18 Régression logistique

19 Estimation des coefficients fit_logistic = glm(default ~ student + balance + income, family = "binomial", data=default) summary(fit_logistic) ## ## Call: ## glm(formula = default ~ student + balance + income, family = "binomial", ## data = Default) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## ## ## Coefficients: ## Estimate Std. Error z value Pr(> z ) ## (Intercept) e e <2e-16 *** ## studentyes e e ## balance 5.330e e <2e-16 *** ## income 1.165e e ## --- ## Signif. codes: 0 '***' '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for binomial family taken to be 1) ## ## Null deviance: on 832 degrees of freedom ## Residual deviance: on 829 degrees of freedom

20 Prédictions # Prédictions predictions = predict(fit_logistic,type = "response") predictions_01 = predict(fit_logistic,type = "response") > 1/2 # Matrice de confusion table(predictions_01,default) ## default ## predictions_01 No Yes ## FALSE ## TRUE

21 ## ## Attaching package: 'gplots' ## The following object is masked from 'package:stats': ## ## lowess

22 Courbe ROC pred <- prediction( predictions, Default$default) perf <- performance( pred, "tpr", "fpr" )

23 plot( perf ) True positive rate False positive rate

24 AUC ROC_auc <- performance( pred,"auc") AUC <- print(auc) ## [1]

25 Test de nullité simultanée des coefficients fit_null = glm(default ~ 1, family = "binomial", data=default) anova(fit_null,fit_logistic,test="chisq") ## Analysis of Deviance Table ## ## Model 1: default ~ 1 ## Model 2: default ~ student + balance + income ## Resid. Df Resid. Dev Df Deviance Pr(>Chi) ## ## < 2.2e-16 *** ## --- ## Signif. codes: 0 '***' '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

26 Résidus de déviance / observations aberrantes which(abs(resid(fit_logistic,type="deviance"))>2) ## ## ## ##

27 Attention pas de diagnostic gaussien! n = 100 x <-rnorm(n) p<-exp(x)/(1+exp(x)) #simulate response y <- rbinom(n,1,p) #fit model model = glm(y~x,family="binomial") qqnorm(resid(model, type="deviance")) abline(0,1) Normal Q Q Plot Sample Quantiles Theoretical Quantiles

28 Leviers d observations influences = influence(fit_logistic) hat = influences$hat which(hat > 3*ncol(Default) /nrow(default)) ## ## ## ##

29 Diagnostic sur X library(car) ## ## Attaching package: 'car' ## The following object is masked from 'package:dplyr': ## ## recode vif(fit_logistic) ## student balance income ##

30 Régression de Poisson

31 Estimation des coefficients glimpse(crab) ## Observations: 173 ## Variables: 6 ## $ Obs <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,... ## $ C <fctr> 2, 3, 3, 4, 2, 1, 4, 2, 2, 2, 1, 3, 2, 2, 3, 3, 2, 2, 2,... ## $ S <fctr> 3, 3, 3, 2, 3, 2, 3, 3, 1, 3, 1, 3, 1, 3, 3, 3, 3, 3, 3,... ## $ W <dbl> 28.3, 26.0, 25.6, 21.0, 29.0, 25.0, 26.2, 24.9, 25.7, ## $ Wt <dbl> 3.05, 2.60, 2.15, 1.85, 3.00, 2.30, 1.30, 2.10, 2.00, ## $ Sa <int> 8, 4, 0, 0, 1, 3, 0, 0, 8, 6, 5, 4, 3, 4, 3, 5, 8, 3, 6, 4... fit_poisson = glm(sa ~ C + S + W + Wt, family = "poisson", data=crab) summary(fit_poisson) ## ## Call: ## glm(formula = Sa ~ C + S + W + Wt, family = "poisson", data = crab) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## ## ## Coefficients: ## Estimate Std. Error z value Pr(> z ) ## (Intercept)

32 Prédictions # Prédictions predictions = predict(fit_poisson,type = "response")

33 Test de nullité simultanée des coefficients fit_null = glm(sa ~ 1, family = "poisson", data=crab) anova(fit_null,fit_poisson,test="chisq") ## Analysis of Deviance Table ## ## Model 1: Sa ~ 1 ## Model 2: Sa ~ C + S + W + Wt ## Resid. Df Resid. Dev Df Deviance Pr(>Chi) ## ## e-15 *** ## --- ## Signif. codes: 0 '***' '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

34 Résidus de déviance / observations aberrantes which(abs(resid(fit_poisson,type="deviance"))>2) ## ## ## ## ## ##

35 Leviers d observations influences = influence(fit_poisson) hat = influences$hat which(hat > 3*(ncol(crab)-1) /nrow(crab)) ## ## ## ##

36 Diagnostic sur X library(car) vif(fit_poisson) ## GVIF Df GVIF^(1/(2*Df)) ## C ## S ## W ## Wt

Discriminant analysis in R QMMA

Discriminant analysis in R QMMA Emanuele Taufer file:///c:/users/emanuele.taufer/google%20drive/2%20corsi/5%20qmma%20-%20mim/0%20labs/l4-lda-eng.html#(1) 1/26 Default data Get the data set Default library(islr)