# READING CREDSCO.TXT dd <- read.table("C:/Users/aluja/Documents/txt/docent/Curs MD english/R/credsco.txt",header=T) dim(dd) names(dd) # FIRST CONTROL: MAXIMS, MINIMUMS, ... (FINDING ERRORS AND OUTLIERS) # WHICH RESPONSE? # WHICH ARE CATEGORICAL AND WHICH ARE CONTINUOUS # ARE THERE MISSINGS? summary(dd) # DEALING WITH MISSINGS: ELIMINATE FROM DICTAMENT MANCANT, VIVENDA MANCANT, ESTAT CIVIL MANCANT I TIPUS DE FEINA MANCANT (VALORS MANCANTS DE LES VARAIBLES CATEGORIQUES, son pocs) table(dd[,1]==0) table(dd[,3]==0) table(dd[,6]==0) table(dd[,8]==0) dd <- dd[dd[,1] != 0 & dd[,3] != 0 & dd[,6] != 0 & dd[,8] != 0,] dim(dd) # LOOK FOR MISSING VALUES AMONG THE CONTINUOUS VARIABLES (99999999 MISSING CODE) # LETS WORK WITH DIRECTLY WITH THE VARIABLES OF THE DATA FRAME objects() attach(dd) objects() table(Ingressos == 99999999) table(Patrimoni == 99999999) table(Carrecs_pat == 99999999) table(Antig_feina == 0) table(Ingressos == 0) # LES VAS. CONTINUES TENEN MASSES VALORS MANCANTS, NO ELS PODEM ELIMINAR COM SI RES # HEM DE FER UN TRACTAMENT DE LES DADES MANCANTS # PASSEM ELS VALORS MANCANTS A NA, INCLOSOS ELS 0 DELS INGRESSOS Ingressos[Ingressos == 99999999 | Ingressos == 0] <- NA Patrimoni[Patrimoni == 99999999] <- NA Carrecs_pat[Carrecs_pat == 99999999] <- NA # WARNING: NOW dd[,10] Ingressos DOESNT HAVE THE SAME CONTENT summary(dd[,10]) summary(Ingressos) # IMPUTATION By THE 1NN library(class) # FOR EVERY INDIVIDUAL WITH INGRESSOS MISSING WE LOOK FOR THE MOST SIMILAR INDIVIDUAL ACCORDING THE REMAINING VARIABLES AND WE COPY THE VALUE OF INGRESSOS ON THE FIRST aux = dd[,-10] dim(aux) aux1 = aux[!is.na(Ingressos),] dim(aux1) aux2 = aux[is.na(Ingressos),] dim(aux2) knn.ing = knn(aux1,aux2,Ingressos[!is.na(Ingressos)]) # NEITHER AUX1, AUX2 CAN CONTAIN NAs) Ingressos[is.na(Ingressos)] = knn.ing # IMPUTATION OF PATRIMONI aux = dd[,-11] aux1 = aux[!is.na(Patrimoni),] aux2 = aux[is.na(Patrimoni),] knn.pat = knn(aux1,aux2,Patrimoni[!is.na(Patrimoni)]) Patrimoni[is.na(Patrimoni)] = knn.pat # IMPUTATION OF CARRECS PATRIMONIALS aux = dd[,-12] aux1 = aux[!is.na(Carrecs_pat),] aux2 = aux[is.na(Carrecs_pat),] knn.car = knn(aux1,aux2, Carrecs_pat[!is.na(Carrecs_pat)]) Carrecs_pat[is.na(Carrecs_pat)] = knn.car Carrecs_pat[Patrimoni==0] <- 0 dd[,10] <- Ingressos dd[,11] <- Patrimoni dd[,12] <- Carrecs_pat # VERIFY AGAIN MAX AND MIN dim(dd) summary(dd) # DECLARE CATEGORICAL Dictamen <- as.factor(Dictamen) Vivenda <- as.factor(Vivenda) Estat_civil <- as.factor(Estat_civil) Registres <- as.factor(Registres) Tipus_feina <- as.factor(Tipus_feina) levels(Dictamen) levels(Vivenda) levels(Estat_civil) levels(Registres) levels(Tipus_feina) levels(Dictamen) <- c("positiu","negatiu") levels(Vivenda) <- c("lloguer","escriptura","contr_privat","ignora_cont","pares","altres viv") levels(Estat_civil) <- c("solter","casat","vidu","separat","divorciat") levels(Registres) <- c("reg_no","reg_si") levels(Tipus_feina) <- c("fixe","temporal","autonom","altres sit") # WARNING is.factor(Dictamen) is.factor(dd[,1]) # DERIVATION OF NEW VARIABLES: “FEATURE EXTRACTION” # RATIO OF FINANCEMENT Rati_fin = 100*Import_sol/Preu_finan hist(Rati_fin) # CAPACITY TO SAVE Estalvi <- (Ingressos-Despeses-(Carrecs_pat/100))/(Import_sol/Plaç) hist(Estalvi) # RECODING THE CONTINUOUS VARIABLES TO CATEGORICAL (TO ASSESS NON LINEARITIES) antigR <- cut(Antig_feina, breaks=c(-1,1,3,8,14,99)) plaçR <- cut(Plaç, breaks=c(0,12,24,36,48,99)) edatR <- cut(Edat, breaks=c(0,25,30,40,50,99)) despesesR <- cut(Despeses, breaks=c(0,40,50,60,80,9999)) ingressosR <- cut(Ingressos, breaks=c(0,80,110,140,190,9999)) patrimoniR <- cut(Patrimoni, breaks=c(-1,0,3000,5000,8000,999999)) carrecsR <- cut(Carrecs_pat, breaks=c(-1,0,500,1500,2500,999999)) importR <- cut(Import_sol, breaks=c(0,600,900,1100,1400,99999)) preuR <- cut(Preu_finan, breaks=c(0,1000,1300,1500,1800,99999)) ratfinR <- cut(Rati_fin, breaks=c(0,50,70,80,90,100)) estalviR <- cut(Estalvi, breaks=c(-99,0,2,4,6,99)) levels(antigR) <- paste("Antig",levels(antigR)) levels(plaçR) <- paste("Plaç",levels(plaçR)) levels(edatR) <- paste("Edat",levels(edatR)) levels(despesesR) <- paste("Desp",levels(despesesR)) levels(ingressosR) <- paste("Ingr",levels(ingressosR)) levels(patrimoniR) <- paste("Patr",levels(patrimoniR)) levels(carrecsR) <- paste("Carr",levels(carrecsR)) levels(importR) <- paste("Import",levels(importR)) levels(preuR) <- paste("Preu",levels(preuR)) levels(ratfinR) <- paste("Ratfin",levels(ratfinR)) levels(estalviR) <- paste("Estalv",levels(estalviR)) # # FEATURE SELECTION: FOR CONTINUOUS VARIABLES FISFER's F # RESPONSE VARIABLE: DICTAMEN # pvalcon <- NULL varc <- list(Antig_feina,Plaç,Edat,Despeses,Ingressos,Patrimoni,Carrecs_pat,Import_sol,Preu_finan,Rati_fin,Estalvi) for (i in 1:11) { pvalcon[i] <- (oneway.test(varc[[i]]~Dictamen))$p.value } pvalcon = matrix(pvalcon) row.names(pvalcon) = c("Antig_feina","Plaç","Edat","Despeses","Ingressos","Patrimoni","Carrecs_pat","Import_sol","Preu_finan","Rati_fin","Estalvi") # ORDERED LIST OF CONTINUOUS VARIABLES ACCORDING THEIR DEPENDENCE OF Dictamen sort(pvalcon[,1]) # # FEATURE SELECTION: FOR CATEGORICAL VARIABLES CHI-SQUARE # RESPONSE VARIABLE: DICTAMEN # pvalcat <- NULL vark <- list(edatR,antigR,plaçR,despesesR,ingressosR,patrimoniR,carrecsR,importR,preuR,ratfinR,estalviR,Vivenda,Estat_civil,Registres,Tipus_feina) for (i in 1:15) { pvalcat[i] <- (chisq.test(vark[[i]],Dictamen))$p.value } pvalcat = matrix(pvalcat) row.names(pvalcat) = c("edatR","antigR","plaçR","despesesR","ingressosR","patrimoniR","carrecsR","importR","preuR","ratfinR","estalviR","Vivenda","Estat_civil","Registres","Tipus_feina") # ORDERED LIST OF CATEGORICAL VARIABLES ACCORDING THEIR DEPENDENCE OF Dictamen sort(pvalcat[,1]) # # WHICH VARIABLE CAN BE DISCARDED? # # NEVERTHELESS ALL EXPLANOTORY VARIABLES HAVE BEEN CHOSEN BY AN EXPERT (FROM HUNDREDS IN THE DB) ! # # PROFILE OF Dictamen # # # GRAPHICAL REPRESENTATION Dictamen * CONTINUOUS VARIABLES # par(ask=TRUE) ncon <- nrow(pvalcon) for (i in 1:ncon) { barplot(tapply(varc[[i]], Dictamen, mean),main=paste("Means by",row.names(pvalcon)[i])) abline(h=mean(varc[[i]])) legend(0,mean(varc[[i]]),"global mean",bty="n") } # FUNCTION # PVALUE OF THE HYPOTHESIS TEST COMPARING THE MEAN OF THE GROUP WITH THE GLOBAL MEAN # WE DETECT POSITIVE DEVIATIONS ONLY # p.xk <- function(vec,fac){nk <- as.vector(table(fac)); n <- sum(nk); xk <- tapply(vec,fac,mean); txk <- (xk-mean(vec))/(sd(vec)*sqrt((n-nk)/(n*nk))); pxk <- pt(txk,n-1,lower.tail=F)} # # FUNCTION # PVALUE OF THE HYPOTHESIS TEST COMPARING THE PROPORTION OF THE GROUP WITHIN ONE MODALITY WITH THE GLOBAL PROPORTION OF THE GROUP # p.zkj <- function(resp,expl){taula <- table(resp,expl);n <- sum(taula); pk <- apply(taula,1,sum)/n; pj <- apply(taula,2,sum)/n;pf <- taula/(n*pk); pjm <- matrix(data=pj,nrow=dim(pf)[1],ncol=dim(pf)[2], byrow=T); dpf <- pf - pjm; dvt <- sqrt(((1-pk)/(n*pk))%*%t(pj*(1-pj))); zkj <- dpf/dvt; pzkj <- pnorm(zkj,lower.tail=F); list(rowpf=pf,vtest=zkj,pval=pzkj)} nresp <- length(levels(Dictamen)) pvalk.con <- matrix(NA,nresp,ncon) rownames(pvalk.con) <- levels(Dictamen) colnames(pvalk.con) <- row.names(pvalcon) for (i in 1:ncon) { pvalk.con[,i] = p.xk(varc[[i]],Dictamen) } for (k in 1:nresp) { print(paste("P.values of Dictamen:",levels(Dictamen)[k])); print(sort(pvalk.con[k,])) } # # GRAPHICAL REPRESENTATION Dictamen * CATEGORICAL VARIABLES # par(mfrow=c(1,3)) n <- nrow(dd) ncat <- nrow(pvalcat) for (i in 1:ncat) { rowprof <- p.zkj(Dictamen,vark[[i]])$rowpf marg <- table(vark[[i]])/n plot(marg,type="l",ylim=c(0,0.6),main=paste("Prop. of pos & neg by",row.names(pvalcat)[i])) lines(rowprof[1,],col="blue") lines(rowprof[2,],col="red") legend("topright",c("pos","neg"),col=c("blue","red"),lty=1) } par(mfrow=c(1,1)) pvalk.cat = NULL for (i in 1:ncat) { auxpvalk <- p.zkj(Dictamen,vark[[i]])$pval pvalk.cat = cbind(pvalk.cat,auxpvalk) } for (k in 1:nresp) { print(paste("P.values of Dictamen:",levels(Dictamen)[k])); print(sort(pvalk.cat[k,])) } # SAVING THE TRANSFORMATIONS IN A INTERNAL R FILE save.image("C:\\Users\\aluja\\Documents\\txt\\docent\\Curs MD english\\R\\credsco_bin")