GA包---遗传算法
zsc 2015-11-09

标签:
#r

GA包—遗传算法

1、用法：(默认求解最大值)

ga(type = c("binary", "real-valued", "permutation"),   
   fitness, ...,                            #  fitness:适应度函数 
   min, max,                                #解得下界/解得上界(多元变量为一个向量)
   nBits,                               #一个种群用二进制编码的长度是多少(长度越大代表精度越高)
   population = gaControl(type)$population,     # 初始种群
   selection = gaControl(type)$selection,       #选择
   crossover = gaControl(type)$crossover,       #交叉
   mutation = gaControl(type)$mutation,         #变异
   popSize = 50,                               #种群大小
   pcrossover = 0.8,                           #交叉概率（默认0.8）
   pmutation = 0.1,                            #变异概率（默认0.1）
   elitism = base::max(1, round(popSize*0.05)),   #代沟(默认情况下,前5%个体将在每个迭代中保留)
   updatePop = FALSE,
   postFitness = NULL,
   maxiter = 100,                               # 最大迭代次数(默认100)
   run = maxiter,
   maxFitness = Inf,                            # 适应度函数的上界,GA搜索后中断
   names = NULL,
   suggestions = NULL, 
   optim = FALSE,
   optimArgs = list(method = "L-BFGS-B", 
                    poptim = 0.05,
                    pressel = 0.5,
                    control = list(fnscale = -1, maxit = 100)),
   keepBest = FALSE,                             #是否保留每一代的最优解
   parallel = FALSE,                             #是否采用并行运算
   monitor = if(interactive())                   #绘图用的,监控遗传算法的运行状况
               { if(is.RStudio()) gaMonitor else gaMonitor2 } 
             else FALSE,
   seed = NULL)         #一个整数值包含随机数发生器的状态。这个参数可以用来复制GA搜索的结果。

2、参数说明

type: 解得编码类型
1. binary :二进制编码
2. real-valued：实数浮点编码
3. permutation：问题涉及到重新排序的列表，字符串编码。可求解TSP问题
通过gaControl设置默认的遗传算子。检索当前设置操作:
- gaControl(“binary”)
- gaControl(“real-valued”)
- gaControl(“permutation”)

3.举例

3.1. 一元函数: $ |x|+cos(x) $

该函数有最小值f(0) = 1（ − *R* < *x* < + *R*）,这里我们将注意力集中在 − 20 < *x* < 20,这个函数可以用R来定义和绘图: + 函数图像

```R
library(GA)
f = function(x)  abs(x)+cos(x)
curve(f, -20, 20)
```

求解最小值

fitness = function(x) -f(x)  #由于这个函数默认求解最大值，所以我们求-f(x)的最大值
GA = ga(type = "real-valued", fitness = fitness, min = -20, max = 20)

GA  ##返回的结果GA为S4对象，一个GA类型 调用其属性用@符号 
#> An object of class "ga"
#> 
#> Call:
#> ga(type = "real-valued", fitness = fitness, min = -20, max = 20)
#> 
#> Available slots:
#>  [1] "call"         "type"         "lower"        "upper"        "nBits"       
#>  [6] "names"        "popSize"      "iter"         "run"          "maxiter"     
#> [11] "suggestions"  "population"   "elitism"      "pcrossover"   "pmutation"   
#> [16] "optim"        "fitness"      "summary"      "bestSol"      "fitnessValue"
#> [21] "solution"
summary(GA)
#> ─ Genetic Algorithm ────────── 
#> 
#> GA settings: 
#> Type                  =  real-valued 
#> Population size       =  50 
#> Number of generations =  100 
#> Elitism               =  2 
#> Crossover probability =  0.8 
#> Mutation probability  =  0.1 
#> Search domain = 
#>        x1
#> lower -20
#> upper  20
#> 
#> GA results: 
#> Iterations             = 100 
#> Fitness function value = -1.000093 
#> Solution = 
#>                x1
#> [1,] 9.324736e-05
plot(GA)  #  #默认情况下，ga函数通过打印每个迭代的平均值和最佳适应度值来监视搜索,画出每代最佳的适应度函数值与每代的平均适应度函数值

curve(f, -20, 20)  # 画出最优值
abline(v = GA@solution, lty = 3)

3.2. 一元函数: (x² + x)cos(x) 监视跟踪

f = function(x)  (x^2+x)*cos(x)      # -10 < x < 10
min = -10
max = 10
curve(f, min, max)#画出该函数的图形

###################
###求解最优值 ####
GA = ga(type = "real-valued", fitness = f, min = min, max = max)
summary(GA)
#> ─ Genetic Algorithm ────────── 
#> 
#> GA settings: 
#> Type                  =  real-valued 
#> Population size       =  50 
#> Number of generations =  100 
#> Elitism               =  2 
#> Crossover probability =  0.8 
#> Mutation probability  =  0.1 
#> Search domain = 
#>        x1
#> lower -10
#> upper  10
#> 
#> GA results: 
#> Iterations             = 100 
#> Fitness function value = 47.70562 
#> Solution = 
#>            x1
#> [1,] 6.560541

plot(GA)# 画图

xfun::pkg_load2('gifski') # 安装并加载该软件包

########################
##编写自己的跟踪功能,点代表一个解，来监视搜索
## 定义一个新的监视器函数，然后将此函数作为可选参数传递给ga:
monitor = function(obj) { 
    curve(f, -10, 10, main = paste("iteration =", obj@iter))
    points(obj@population, obj@fitness, pch = 20, col = 2)
    rug(obj@population, col = 2)
    Sys.sleep(0.2)}
## 监视函数

GA = ga(type = "real-valued", fitness = f, min = -10, max = 10, monitor = monitor)

##############
## 也可以储存为视频，观看，利用动画的函数做出视频  
# library(animation)    
# oopts = ani.options(ffmpeg = "F:/ffmpeg/bin/ffmpeg.exe")#在winds中设置ffmpeg    
# saveVideo({    
#   #打印图片    
#   GA = ga(type = "real-valued", fitness = f, min = -10, max = 10, monitor = monitor)    
#   ani.options(interval = 0.1, nmax = 250)    
# }, video.name = "jianshi.mp4", other.opts = "-b 500k")    
###############  

##########################
GA = ga(type = "real-valued", fitness = f, min = -10, max = 10, monitor = NULL)#最优值
summary(GA)
#> ─ Genetic Algorithm ────────── 
#> 
#> GA settings: 
#> Type                  =  real-valued 
#> Population size       =  50 
#> Number of generations =  100 
#> Elitism               =  2 
#> Crossover probability =  0.8 
#> Mutation probability  =  0.1 
#> Search domain = 
#>        x1
#> lower -10
#> upper  10
#> 
#> GA results: 
#> Iterations             = 100 
#> Fitness function value = 47.70562 
#> Solution = 
#>            x1
#> [1,] 6.560538

##画出图像
monitor(GA)
abline(v = GA@solution, lty = 3)

3.3. 二元函数: 20 + x₁² + x₂² − 10(cos(2*π**x₁) + c**os(2πx*₂))

Rastrigin = function(x1, x2) {
    20 + x1^2 + x2^2 - 10 * (cos(2 * pi * x1) + cos(2 * pi * x2))
}
x1 = seq(-5.12, 5.12, by = 0.1)
x2 = seq(-5.12, 5.12, by = 0.1)
f = outer(x1, x2, Rastrigin)
persp3D(x1, x2, f, theta = 50, phi = 20)

filled.contour(x1, x2, f, color.palette = jet.colors)

monitor = function(obj) {
    contour(x1, x2, f, drawlabels = FALSE, col = gray(0.5))
    title(paste("iteration =", obj@iter), font.main = 1)
    points(obj@population, pch = 20, col = 2)
    Sys.sleep(0.2)
}

GA = ga(type = "real-valued", fitness = function(x) -Rastrigin(x[1], x[2]), min = c(-5.12, 
    -5.12), max = c(5.12, 5.12), popSize = 50, maxiter = 100, monitor = NULL)  #定义monitor,暂时不运行

summary(GA)
#> ─ Genetic Algorithm ────────── 
#> 
#> GA settings: 
#> Type                  =  real-valued 
#> Population size       =  50 
#> Number of generations =  100 
#> Elitism               =  2 
#> Crossover probability =  0.8 
#> Mutation probability  =  0.1 
#> Search domain = 
#>          x1    x2
#> lower -5.12 -5.12
#> upper  5.12  5.12
#> 
#> GA results: 
#> Iterations             = 100 
#> Fitness function value = -0.0001561961 
#> Solution = 
#>                x1           x2
#> [1,] 0.0002792732 0.0008422104

plot(GA)

3.4.曲线拟合

XXX提出了一种利用树木生长数据的曲线拟合应用。利用一种被称为理查兹曲线（贝尔曼方程）的流行生态模型，将云杉树树干的体积与年龄的关系进行建模。该方程为 f(x) = a(1 − e^− *b*x)^c 这是一个非线性回归模型参数*θ* = (a, b, c)^T。使用二次损失函数(即非线性最小二乘法)，理查兹曲线可以使用遗传算法进行拟合:

 data("trees", package = "spuRs")  # 1200行* 3列(100颗数，每棵树一共有12次观察值)，其中三列为ID，年龄(Age)，体积(Vol)
 
 tree = trees[trees$ID == "1.3.11", 2:3]  #提取某一颗树， 一共有12个观测值  两个变量(Age,Vol)
 richards = function(x, theta) theta[1] * (1 - exp(-theta[2] * x))^theta[3] #上面的f(x)
 fitnessL2 = function(theta, x, y) -sum((y - richards(x, theta))^2)#损失函数，用非线性最小二乘法
 ###等价  fitnessL1 = function(theta, x, y) -sum(abs(y - richards(x, theta)))
 ###损失函数中绝对值与平方的关系
 GA2 = ga(type = "real-valued", fitness = fitnessL2,
            x = tree$Age, y = tree$Vol, min = c(3000, 0, 2), max = c(4000, 1, 4),
            popSize = 500, crossover = gareal_blxCrossover, maxiter = 5000,
            run = 200, names = c("a", "b", "c"))
 summary(GA2)
#> ─ Genetic Algorithm ────────── 
#> 
#> GA settings: 
#> Type                  =  real-valued 
#> Population size       =  500 
#> Number of generations =  5000 
#> Elitism               =  25 
#> Crossover probability =  0.8 
#> Mutation probability  =  0.1 
#> Search domain = 
#>          a b c
#> lower 3000 0 2
#> upper 4000 1 4
#> 
#> GA results: 
#> Iterations             = 2902 
#> Fitness function value = -2773.843 
#> Solution = 
#>            a          b        c
#> [1,] 3592.21 0.01544159 2.783316
 GA2@solution
#>            a          b        c
#> [1,] 3592.21 0.01544159 2.783316

注意，给定了x和y中的观测数据后，我们要求在参数θ 下，适应度函数fitnessL2最大化，后者在调用函数ga时提供了进一步的参数，并且在搜索过程中保持不变。

3.5子集的选择

给定一组预测因子，子集选择的目的是确定最相关的预测因子，以解释响应变量的变化。我们可以利用遗传算法来赛选出更好的相关因子，从而产生更好的估计和更清晰的回归系数解释。使用二进制字符串可以自然地处理赛选子集问题，其中1表示选择该变量，0表示不选择该变量。

候选子集的适合度可以通过几种模型选择标准之一来衡量，比如AIC、BIC等

我们首先从UsingR包中加载数据集，然后通过OLS拟合线性回归模型:

 data("fat", package = "UsingR")#252*19
 mod = lm(body.fat.siri ~ age + weight + height + neck + chest + abdomen +
                hip + thigh + knee + ankle + bicep + forearm + wrist, data = fat)
    #通过观察，因变量（body.fat.siri） 与上述 13个自变量有关系
 summary(mod)
#> 
#> Call:
#> lm(formula = body.fat.siri ~ age + weight + height + neck + chest + 
#>     abdomen + hip + thigh + knee + ankle + bicep + forearm + 
#>     wrist, data = fat)
#> 
#> Residuals:
#>      Min       1Q   Median       3Q      Max 
#> -11.1687  -2.8639  -0.1014   3.2085  10.0068 
#> 
#> Coefficients:
#>              Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) -18.18849   17.34857  -1.048  0.29551    
#> age           0.06208    0.03235   1.919  0.05618 .  
#> weight       -0.08844    0.05353  -1.652  0.09978 .  
#> height       -0.06959    0.09601  -0.725  0.46925    
#> neck         -0.47060    0.23247  -2.024  0.04405 *  
#> chest        -0.02386    0.09915  -0.241  0.81000    
#> abdomen       0.95477    0.08645  11.044  < 2e-16 ***
#> hip          -0.20754    0.14591  -1.422  0.15622    
#> thigh         0.23610    0.14436   1.636  0.10326    
#> knee          0.01528    0.24198   0.063  0.94970    
#> ankle         0.17400    0.22147   0.786  0.43285    
#> bicep         0.18160    0.17113   1.061  0.28966    
#> forearm       0.45202    0.19913   2.270  0.02410 *  
#> wrist        -1.62064    0.53495  -3.030  0.00272 ** 
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 4.305 on 238 degrees of freedom
#> Multiple R-squared:  0.749,  Adjusted R-squared:  0.7353 
#> F-statistic: 54.65 on 13 and 238 DF,  p-value: < 2.2e-16

设计矩阵(没有截距)和响应变量从拟合模型对象中提取:

 x = model.matrix(mod)[, -1] # 252*13  选取13 个自变量的列，构建自变量
 y = model.response(model.frame(mod)) # 提取因变量的列，

那么，最大化的适应度函数可以定义为:

   fitness = function(string) {
     inc = which(string == 1)
     X = cbind(1, x[,inc])
     mod = lm.fit(X, y)
     class(mod) = "lm"
    -AIC(mod)
  }

这仅仅是利用字符串对应位置上的1所标识的预测因子来估计回归模型，并返回所选标准的负值。注意，截取项总是包含在内，并且我们使用基本的lm。拟合函数加速计算。下面的R代码运行GA:

 GA = ga("binary", fitness = fitness, nBits = ncol(x),names = colnames(x))# monitor = plot
 plot(GA)

 summary(GA)
#> ─ Genetic Algorithm ────────── 
#> 
#> GA settings: 
#> Type                  =  binary 
#> Population size       =  50 
#> Number of generations =  100 
#> Elitism               =  2 
#> Crossover probability =  0.8 
#> Mutation probability  =  0.1 
#> 
#> GA results: 
#> Iterations             = 100 
#> Fitness function value = -1458.996 
#> Solution = 
#>      age weight height neck chest abdomen hip thigh knee ankle bicep forearm
#> [1,]   1      1      0    1     0       1   1     1    0     0     0       1
#>      wrist
#> [1,]     1

利用GA找到的最优子集得到的线性回归模型如下:

  mod2 = lm(body.fat.siri ~ .,
               data = data.frame(body.fat.siri = y, x[,GA@solution == 1]))
  summary(mod2)
#> 
#> Call:
#> lm(formula = body.fat.siri ~ ., data = data.frame(body.fat.siri = y, 
#>     x[, GA@solution == 1]))
#> 
#> Residuals:
#>      Min       1Q   Median       3Q      Max 
#> -10.9757  -2.9937  -0.1644   2.9766  10.2244 
#> 
#> Coefficients:
#>              Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) -22.65637   11.71385  -1.934  0.05426 .  
#> age           0.06578    0.03078   2.137  0.03356 *  
#> weight       -0.08985    0.03991  -2.252  0.02524 *  
#> neck         -0.46656    0.22462  -2.077  0.03884 *  
#> abdomen       0.94482    0.07193  13.134  < 2e-16 ***
#> hip          -0.19543    0.13847  -1.411  0.15940    
#> thigh         0.30239    0.12904   2.343  0.01992 *  
#> forearm       0.51572    0.18631   2.768  0.00607 ** 
#> wrist        -1.53665    0.50939  -3.017  0.00283 ** 
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 4.282 on 243 degrees of freedom
#> Multiple R-squared:  0.7466, Adjusted R-squared:  0.7382 
#> F-statistic: 89.47 on 8 and 243 DF,  p-value: < 2.2e-16

上述两种解决方案相比，

3.6、约束优化

背包问题:假定背包的最大容量为W，N件物品，每件物品都有自己的价值和重量，将物品放入背包中使得背包内物品的总价值最大。

如果第i项选择为背包，则xi = 1，不选则xi = 0。考虑以下数据，利润(p)，权重(w)和容量(V):

 p = c(6, 5, 8, 9, 6, 7, 3)
 w = c(2, 3, 6, 7, 5, 9, 4)
 V = 14
#利用二进制遗传算法可以解决背包问题，但由于不等式约束，并不是所有可行解都可行。我们可以通过惩罚不可行的解决办法来考虑约束。因此，适应度函数可以定义如下:
 knapsack = function(x) {
   f = sum(x * p)
   penalty = sum(w) * abs(sum(x * w)-V)
   f - penalty
   }
#当目标函数f被惩罚时，一个量取决于提出的解决方案的容量与背包容量之间的距离。然后:
 GA = ga(type = "binary", fitness = knapsack, nBits = length(w),
             maxiter = 1000, run = 200, popSize = 20)
 summary(GA)
#> ─ Genetic Algorithm ────────── 
#> 
#> GA settings: 
#> Type                  =  binary 
#> Population size       =  20 
#> Number of generations =  1000 
#> Elitism               =  1 
#> Crossover probability =  0.8 
#> Mutation probability  =  0.1 
#> 
#> GA results: 
#> Iterations             = 200 
#> Fitness function value = 21 
#> Solution = 
#>      x1 x2 x3 x4 x5 x6 x7
#> [1,]  1  0  0  1  1  0  0
 sum(p * GA@solution)
#> [1] 21
 sum(w * GA@solution)
#> [1] 14

3.7. 解决TSP问题

有若干个城市(n个)，任何两个城市之间的距离都是确定的，现要求一旅行商从某城市出发必须经过每一个城市且只在一个城市逗留一次，最后回到出发的城市，问如何事先确定一条最短的线路。可行的解决方案的设置是由可能的途径的总数,等于(n−1)!/2.

考虑一个简单的例子，现在共有A、B、C、D四个城市。

为了将图中关系数据化，可用如下规则来描述：
城市映射为编号：A——1，B——2，C——3，D——4；
城市之间的距离用二维数组来表示，记为D[i][j]，如：D[1][2]表示城市A与城市B之间的距离，于是D[1][2]=7，同时也表示我现在处于A–1城市，将要去B–2城市；
用一维数组S[i]来存储访问过的路径。比如随机访问路径1–3–2–4–1，求其总和为D₁₃ + D₃₂ + D₂₄ + D₄₁ 观察下标，发现和路径成偏移一位的状态，

可用遗传算法解决此类问题

#city数据结构 一共有31个城市，计算出每两个城市之间的距离，用距离矩阵表示
D=as.matrix(dist(city)) # 距离矩阵
#定义总的路线长度
tourLength = function(tour, distMatrix) {
   tour = c(tour, tour[1]) #设置为回路，代表所走的路径
   route = embed(tour, 2)[,2:1]#根据回路,产生对应的距离矩阵的下标,每一行代表一个坐标，第一行处于i坐标
   sum(distMatrix[route])
}
tspFitness = function(tour, ...) 1/tourLength(tour, ...)

GA3 = ga(type = "permutation", fitness = tspFitness, distMatrix = D,
          min = 1, max = dim(D)[1], popSize = 50, maxiter = 5000,
          run = 500, pmutation = 0.2)

summary(GA3)
#> ─ Genetic Algorithm ────────── 
#> 
#> GA settings: 
#> Type                  =  permutation 
#> Population size       =  50 
#> Number of generations =  5000 
#> Elitism               =  2 
#> Crossover probability =  0.8 
#> Mutation probability  =  0.2 
#> 
#> GA results: 
#> Iterations             = 3424 
#> Fitness function value = 5.952314e-05 
#> Solutions = 
#>      x1 x2 x3 x4 x5 x6 x7 x8 x9 x10  ...  x30 x31
#> [1,] 14 15  1 29 11  6 23 16 24  25        13  12
#> [2,]  7 13 12 14 15  1 29 11  6  23         4   5

#找到的解决方案对应于一条独特的路径，其行程长度等于:
apply(GA3@solution, 1, tourLength, D)
#> [1] 16800.19 16800.19

图形对比

library(ggplot2)
library(tibble)
city=rownames_to_column(city, var = "rowname")
#原始图像
# p=ggplot(data=city,aes(x=X1,y=X2))+geom_point(shape=19,size=4,col="red")+theme_bw()+
#   geom_text(aes(label=rowname),size=4,vjust=1.5)+geom_path(linetype=2)+ggtitle("初始顺序")
library(dplyr)
city_solution=data.frame(solution=as.factor(GA3@solution[1,]))
re_city=inner_join(city_solution,city,by=c("solution"="rowname"))
#re_city #最优解城市顺序
##最优解城市图像
j=ggplot(data=re_city,aes(x=X1,y=X2))+geom_point(shape=19,size=4,col="red")+theme_bw()+
  geom_text(aes(label=solution),size=4,vjust=1.5)+geom_path(linetype=2)+
  ggtitle("最优解")+
  geom_point(x=re_city[1,2],y=re_city[1,3],shape=17,size=5,col="blue")+
   theme(plot.title = element_text(hjust = 0.5)) + theme(text=element_text(family="Songti SC"))

j

sessionInfo()
#> R version 4.0.2 (2020-06-22)
#> Platform: x86_64-apple-darwin17.0 (64-bit)
#> Running under: macOS Mojave 10.14.5
#> 
#> Matrix products: default
#> BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
#> 
#> locale:
#> [1] zh_CN.UTF-8/zh_CN.UTF-8/zh_CN.UTF-8/C/zh_CN.UTF-8/zh_CN.UTF-8
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] dplyr_1.0.1      tibble_3.0.3     ggplot2_3.3.2    igraph_1.2.5    
#> [5] GA_3.2           iterators_1.0.12 foreach_1.5.0   
#> 
#> loaded via a namespace (and not attached):
#>  [1] Rcpp_1.0.5       compiler_4.0.2   pillar_1.4.6     formatR_1.7     
#>  [5] tools_4.0.2      digest_0.6.25    evaluate_0.14    lifecycle_0.2.0 
#>  [9] gtable_0.3.0     pkgconfig_2.0.3  rlang_0.4.7      cli_2.0.2       
#> [13] yaml_2.2.1       xfun_0.17        withr_2.2.0      stringr_1.4.0   
#> [17] knitr_1.29       generics_0.0.2   vctrs_0.3.2      tidyselect_1.1.0
#> [21] grid_4.0.2       glue_1.4.1       R6_2.4.1         gifski_0.8.6    
#> [25] fansi_0.4.1      rmarkdown_2.3    farver_2.0.3     purrr_0.3.4     
#> [29] magrittr_1.5     scales_1.1.1     codetools_0.2-16 htmltools_0.5.0 
#> [33] ellipsis_0.3.1   assertthat_0.2.1 colorspace_1.4-1 labeling_0.3    
#> [37] stringi_1.4.6    munsell_0.5.0    crayon_1.3.4