Loading the libraries

library(neuralnet)  # regression

library(nnet) # classification 

library(NeuralNetTools) # network visualization (plotnet)

library(plyr) # revalue() for recoding categorical labels

Loading the data

library(readr)
Startups <- read_csv("/home/rpalma/AAA_Datos/2020/Posgrado/Di3/Datasets/50 Start Ups/50_Startups_LAC.csv")
## Rows: 50 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Pais
## dbl (5): R_D_Spend, POM, Logist_Market, Profit, Supervivencia
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Categoric <- read_csv("/home/rpalma/AAA_Datos/2020/Posgrado/Di3/Datasets/50 Start Ups/50_Startups_Categoric_LAC.csv")
## Rows: 50 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Pais, Supervivencia
## dbl (4): R_D_Spend, POM, Logist_Market, Profit
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

The Supervivencia column of the Categoric table carries labels that identify the situation in which each venture ended up, namely: BankR, RevEq, and SpinOff.

Handling categorical variables

tabla1 <- table(Categoric$Pais)
tabla2 <- table(Categoric$Supervivencia)
tabla3 <- table(Categoric$Pais,Categoric$Supervivencia)
plot(tabla1, col=c("red","green","blue"))

plot(tabla2, col=c("red","green","blue"))

plot(tabla3, col=c("red","green","blue"))

Overlaid histograms

ind_1 <- which(Categoric$Pais=="Colombia")
p1 <- as.matrix(Categoric[ind_1,5])

ind_2 <- which(Categoric$Pais=="Ecuador")
p2 <- as.matrix(Categoric[ind_2,5])

ind_3 <- which(Categoric$Pais=="Chile")
p3 <- as.matrix(Categoric[ind_3,5])
hp1 <- hist(p1)

hp2 <- hist(p2)

hp3 <- hist(p3)

par(mfrow=c(3,1))
plot( hp1, col=rgb(0,0,1,1/4), xlim=c(30000,200000),ylim=c(0,5),main="Colombia")
plot( hp2, col=rgb(1,0,0,1/4),xlim=c(30000,200000),ylim=c(0,10),main="Ecuador")
plot( hp3, col=rgb(1,0,0,1/4),xlim=c(30000,200000),ylim=c(0,10),main="Chile")

par(mfrow=c(1,3))
plot( hp1, col=rgb(0,0,1,1/4), xlim=c(30000,200000),ylim=c(0,5),main="Colombia")
plot( hp2, col=rgb(1,0,0,1/4),xlim=c(30000,200000),ylim=c(0,10),main="Ecuador")
plot( hp3, col=rgb(1,0,0,1/4),xlim=c(30000,200000),ylim=c(0,10),main="Chile")
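The panels above show the three histograms side by side. To actually overlay them in a single panel, as the section title suggests, a minimal sketch using the add argument of plot.histogram could be (the title string is illustrative only):

plot(hp1, col = rgb(0,0,1,1/4), xlim = c(30000,200000), ylim = c(0,10), main = "Profit by country")  # Colombia, semi-transparent blue
plot(hp2, col = rgb(1,0,0,1/4), add = TRUE)  # Ecuador, semi-transparent red on top
plot(hp3, col = rgb(0,1,0,1/4), add = TRUE)  # Chile, semi-transparent green on top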

pairs(Categoric[ ,1:3])

boxplot(Categoric[ ,1:3])

Categoric$Pais <- as.numeric(revalue(Categoric$Pais,
                          c("Colombia"="0", "Ecuador"="1",
                            "Chile"="2")))

Table of categorical fields

Categoric$Supervivencia <- as.numeric(revalue(Categoric$Supervivencia,
                          c("BankR"="0", "RevEq"="1",
                            "SpinOff"="2")))

Profit versus country (Pais)

plot(Categoric$Pais, Categoric$Profit)

Table visualization

Text (pipe) table

library(kableExtra)
kable(head(Categoric), "pipe")
| R_D_Spend|       POM| Logist_Market| Pais|   Profit| Supervivencia|
|---------:|---------:|-------------:|----:|--------:|-------------:|
|  165349.2| 136897.80|      471784.1|    0| 192261.8|             2|
|  162597.7| 151377.59|      443898.5|    1| 191792.1|             2|
|  153441.5| 101145.55|      407934.5|    2| 191050.4|             2|
|  144372.4| 118671.85|      383199.6|    0| 182902.0|             2|
|  142107.3|  91391.77|      366168.4|    2| 166187.9|             2|
|  131876.9|  99814.71|      362861.4|    0| 156991.1|             2|

Simple table

kable(head(Categoric), "simple")
 R_D_Spend         POM   Logist_Market   Pais     Profit   Supervivencia
----------  ----------  --------------  -----  ---------  --------------
  165349.2   136897.80        471784.1      0   192261.8               2
  162597.7   151377.59        443898.5      1   191792.1               2
  153441.5   101145.55        407934.5      2   191050.4               2
  144372.4   118671.85        383199.6      0   182902.0               2
  142107.3    91391.77        366168.4      2   166187.9               2
  131876.9    99814.71        362861.4      0   156991.1               2

Normalization

# min-max scaling of a numeric vector to the [0, 1] interval
normalize <- function(x){
  return( (x - min(x)) / (max(x) - min(x)) )
}

Startups_norm<-as.data.frame(lapply(Categoric,FUN=normalize))
summary(Startups_norm$Profit)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.4249  0.5254  0.5481  0.7044  1.0000
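As a quick sanity check (a minimal sketch), every normalized column should now lie in the [0, 1] interval:

sapply(Startups_norm, range)  # each column should report min 0 and max 1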

Original data versus normalized data

head(Categoric$Profit)
## [1] 192261.8 191792.1 191050.4 182902.0 166187.9 156991.1
head(Startups_norm)
##   R_D_Spend       POM Logist_Market Pais    Profit Supervivencia
## 1 1.0000000 0.6517439     1.0000000  0.0 1.0000000             1
## 2 0.9833595 0.7619717     0.9408934  0.5 0.9973546             1
## 3 0.9279846 0.3795790     0.8646636  1.0 0.9931781             1
## 4 0.8731364 0.5129984     0.8122351  0.0 0.9472924             1
## 5 0.8594377 0.3053280     0.7761356  1.0 0.8531714             1
## 6 0.7975660 0.3694479     0.7691259  0.0 0.8013818             1

Sampling for training

indice <- sample(2, nrow(Startups_norm), replace = TRUE, prob = c(0.7,0.3))
startups_train <- Startups_norm[indice==1,]
startups_test  <- Startups_norm[indice==2,]
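Because sample() draws at random, the 70/30 split changes on every run. A minimal sketch, assuming we want the split to be reproducible across knits, is to fix the random seed before sampling (the value 123 is arbitrary):

set.seed(123)  # arbitrary seed, only to make the split reproducible
indice <- sample(2, nrow(Startups_norm), replace = TRUE, prob = c(0.7, 0.3))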

Neural Net model

library(neuralnet)
attach(Categoric)

startups_model <- neuralnet(Profit ~ R_D_Spend+ POM + Logist_Market + Pais , data = startups_train)

str(startups_model)
## List of 14
##  $ call               : language neuralnet(formula = Profit ~ R_D_Spend + POM + Logist_Market + Pais, data = startups_train)
##  $ response           : num [1:38, 1] 1 0.997 0.993 0.947 0.853 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:38] "1" "2" "3" "4" ...
##   .. ..$ : chr "Profit"
##  $ covariate          : num [1:38, 1:4] 1 0.983 0.928 0.873 0.859 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:38] "1" "2" "3" "4" ...
##   .. ..$ : chr [1:4] "R_D_Spend" "POM" "Logist_Market" "Pais"
##  $ model.list         :List of 2
##   ..$ response : chr "Profit"
##   ..$ variables: chr [1:4] "R_D_Spend" "POM" "Logist_Market" "Pais"
##  $ err.fct            :function (x, y)  
##   ..- attr(*, "type")= chr "sse"
##  $ act.fct            :function (x)  
##   ..- attr(*, "type")= chr "logistic"
##  $ linear.output      : logi TRUE
##  $ data               :'data.frame': 38 obs. of  6 variables:
##   ..$ R_D_Spend    : num [1:38] 1 0.983 0.928 0.873 0.859 ...
##   ..$ POM          : num [1:38] 0.652 0.762 0.38 0.513 0.305 ...
##   ..$ Logist_Market: num [1:38] 1 0.941 0.865 0.812 0.776 ...
##   ..$ Pais         : num [1:38] 0 0.5 1 0 1 0.5 1 0.5 1 0.5 ...
##   ..$ Profit       : num [1:38] 1 0.997 0.993 0.947 0.853 ...
##   ..$ Supervivencia: num [1:38] 1 1 1 1 1 1 1 0.5 0.5 0.5 ...
##  $ exclude            : NULL
##  $ net.result         :List of 1
##   ..$ : num [1:38, 1] 0.974 0.96 0.932 0.902 0.889 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:38] "1" "2" "3" "4" ...
##   .. .. ..$ : NULL
##  $ weights            :List of 1
##   ..$ :List of 2
##   .. ..$ : num [1:5, 1] -1.6669 2.9771 -0.0746 0.5157 -0.0244
##   .. ..$ : num [1:2, 1] 0.0569 1.0724
##  $ generalized.weights:List of 1
##   ..$ : num [1:38, 1:4] 15.68 11.12 7.55 6.03 5.64 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:38] "1" "2" "3" "4" ...
##   .. .. ..$ : NULL
##  $ startweights       :List of 1
##   ..$ :List of 2
##   .. ..$ : num [1:5, 1] -0.6355 0.4437 0.3312 -0.0529 -1.3376
##   .. ..$ : num [1:2, 1] -0.632 -0.58
##  $ result.matrix      : num [1:10, 1] 0.06428 0.00725 339 -1.66687 2.97714 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:10] "error" "reached.threshold" "steps" "Intercept.to.1layhid1" ...
##   .. ..$ : NULL
##  - attr(*, "class")= chr "nn"

Plotting the neural network

plot(startups_model, rep = "best")

Proportional network plot

This plot indicates which inputs behave as the KPIs.

par(mar = numeric(4), family = 'serif')
plotnet(startups_model, alpha = 0.6)
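NeuralNetTools also offers variable-importance measures that complement the proportional plot; a sketch using Garson's algorithm (applicable here because the model has a single hidden layer and a single output) could be the following. The olden() function is an alternative that preserves the sign of the weights.

garson(startups_model)  # relative importance of each input on Profit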

Evaluating model performance

model_results <- compute(startups_model,startups_test[1:4])
predicted_profit <- model_results$net.result

Predicted profit vs. actual profit on the test data.

cor(predicted_profit,startups_test$Profit)
##          [,1]
## [1,] 0.973849
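Besides the correlation, a scatter plot of predicted versus observed values makes the fit easy to inspect visually; a minimal sketch:

plot(startups_test$Profit, predicted_profit,
     xlab = "Observed Profit (normalized)", ylab = "Predicted Profit (normalized)")
abline(0, 1, col = "red")  # perfect-prediction line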

De-normalizing the results

Since the predictions were made on normalized data, we now have to transform them back to the original scale.

str_max <- max(Startups$Profit)
str_min <- min(Startups$Profit)

unnormalize <- function(x, min, max) { 
  return( (max - min)*x + min )
}

ActualProfit_pred <- unnormalize(predicted_profit,str_min,str_max)
head(ActualProfit_pred)
##        [,1]
## 6  166817.6
## 9  155717.0
## 14 129753.0
## 17 118788.3
## 22 120662.2
## 23 116761.1
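With the predictions back on the original scale we can also quantify the error in monetary units; a sketch that de-normalizes the observed test profits and computes the RMSE:

ActualProfit_test <- unnormalize(startups_test$Profit, str_min, str_max)  # observed profits, original scale
rmse <- sqrt(mean((ActualProfit_pred - ActualProfit_test)^2))
rmse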

Improving model performance

Performance can be improved by adding more hidden layers; here we use two hidden layers with 2 and 4 neurons.

Startups_model2 <- neuralnet(Profit~R_D_Spend+ POM + Logist_Market + Pais , data = startups_train, hidden = c(2,4))

plot(Startups_model2 ,rep = "best")

Performance of the improved model

model_results2<-compute(Startups_model2,startups_test[1:4])
predicted_Profit2<-model_results2$net.result
cor(predicted_Profit2,startups_test$Profit)
##           [,1]
## [1,] 0.9737566

KPIs of the improved model

par(mar = numeric(4), family = 'serif')
plotnet(Startups_model2, alpha = 0.6)

Neural Net classification

We assemble the dataset of observations to classify.

library(nnet)

supervivencia <- as.factor(Categoric$Supervivencia)
R_D_Spend <- as.matrix(Categoric$R_D_Spend)
POM <- as.matrix(Categoric$POM)
Logist_Market <- as.matrix(Categoric$Logist_Market)
Clasificar <- data.frame (supervivencia,R_D_Spend,POM,Logist_Market)

Sampling

indice <- sample(2, nrow(Clasificar), replace = TRUE, prob = c(0.7,0.3))

clasificar_train <- Startups_norm[indice==1,]  # note: the split is taken from the normalized data frame
clasificar_test  <- Startups_norm[indice==2,]

supervivientes_clasificados <- factor(clasificar_train$Supervivencia)

Training nnet as a classifier

supervivientes_train<-nnet(supervivientes_clasificados~clasificar_train$R_D_Spend + clasificar_train$POM+ clasificar_train$Logist_Market ,data=clasificar_train,size=5, decay=5e-4, maxit=2000)
## # weights:  38
## initial  value 45.168420 
## iter  10 value 14.655054
## iter  20 value 3.731461
## iter  30 value 1.901369
## iter  40 value 1.692317
## iter  50 value 1.524971
## iter  60 value 1.455029
## iter  70 value 1.436595
## iter  80 value 1.422593
## iter  90 value 1.402283
## iter 100 value 1.359745
## iter 110 value 1.349081
## iter 120 value 1.338589
## iter 130 value 1.332154
## iter 140 value 1.328070
## iter 150 value 1.326972
## iter 160 value 1.326546
## iter 170 value 1.325602
## iter 180 value 1.324245
## iter 190 value 1.322318
## iter 200 value 1.320788
## iter 210 value 1.317902
## iter 220 value 1.317289
## iter 230 value 1.316978
## iter 240 value 1.316919
## iter 250 value 1.316900
## iter 260 value 1.316882
## iter 270 value 1.316852
## iter 280 value 1.316824
## iter 290 value 1.316774
## iter 300 value 1.316683
## iter 310 value 1.316667
## iter 320 value 1.316644
## iter 330 value 1.316622
## iter 340 value 1.316613
## iter 350 value 1.316605
## iter 360 value 1.316599
## iter 370 value 1.316593
## iter 380 value 1.316588
## iter 390 value 1.316566
## iter 400 value 1.316558
## iter 410 value 1.316555
## iter 420 value 1.316549
## iter 430 value 1.316547
## iter 440 value 1.316544
## iter 450 value 1.316540
## iter 460 value 1.316537
## iter 470 value 1.316531
## iter 480 value 1.316530
## iter 490 value 1.316527
## iter 500 value 1.316523
## iter 510 value 1.316522
## iter 520 value 1.316520
## iter 530 value 1.316519
## iter 540 value 1.316516
## iter 540 value 1.316516
## final  value 1.316516 
## converged

Visualizing the classifier

plotnet(supervivientes_train, alpha = 0.6)
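The classifier has not yet been evaluated on the held-out rows. Because the formula above references clasificar_train$... directly, predict() cannot be pointed at new data, so the sketch below refits the same model with plain column names and then builds a confusion table on clasificar_test; the refit (modelo_clf, pred_test) is an assumption added for illustration, not part of the original workflow.

# Refit with plain column names so predict() can use newdata
modelo_clf <- nnet(factor(Supervivencia) ~ R_D_Spend + POM + Logist_Market,
                   data = clasificar_train, size = 5, decay = 5e-4,
                   maxit = 2000, trace = FALSE)
pred_test <- predict(modelo_clf, newdata = clasificar_test, type = "class")
table(Predicted = pred_test, Observed = clasificar_test$Supervivencia)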

Building the training and prediction sets

https://stackoverrun.com/es/q/3338607

crs$nnet <- nnet(as.factor(Target) ~ ., data=crs$dataset[crs$sample, c(crs$input, crs$target)], size=10, skip=TRUE, MaxNWts=10000, trace=FALSE, maxit=100)