dplyr -- 2实战

把前两天的两篇文章合并成一篇。解决方法:文件名不能太长;在content目录下新建test目录,把文章放在content/test目录下,而不是post目录下(我的test目录下只有两篇文章)。

1.1、选择行filter()

安装nycflights13包,该软件包中的飞机航班数据将用于本文中dplyr包各个函数的演示

library(dplyr)
library(nycflights13)

函数tbl_df()(自 dplyr 1.0.0 起已弃用,推荐改用 as_tibble())将过长过大的数据集转换为显示更友好的 tbl_df 类型:

# tbl_df() is deprecated as of dplyr 1.0.0; as_tibble() performs the same
# conversion to the print-friendly tibble class.
flights <- as_tibble(flights)
head(flights) # the full data set has 336,776 rows x 19 columns
## # A tibble: 6 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     1      517            515         2      830            819
## 2  2013     1     1      533            529         4      850            830
## 3  2013     1     1      542            540         2      923            850
## 4  2013     1     1      544            545        -1     1004           1022
## 5  2013     1     1      554            600        -6      812            837
## 6  2013     1     1      554            558        -4      740            728
## # … with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# All flights that departed from "JFK" in June.
flights %>% filter(origin == "JFK", month == 6L)
## # A tibble: 9,472 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     6     1        2           2359         3      341            350
##  2  2013     6     1      538            545        -7      925            922
##  3  2013     6     1      539            540        -1      832            840
##  4  2013     6     1      553            600        -7      700            711
##  5  2013     6     1      554            600        -6      851            908
##  6  2013     6     1      557            600        -3      934            942
##  7  2013     6     1      559            600        -1      856            930
##  8  2013     6     1      606            610        -4      847            906
##  9  2013     6     1      609            615        -6      759            808
## 10  2013     6     1      615            610         5      837            847
## # … with 9,462 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Keep only rows 1 and 2.
flights %>% slice(1:2)
## # A tibble: 2 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     1      517            515         2      830            819
## 2  2013     1     1      533            529         4      850            830
## # … with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# Draw 4 records at random, sampling with replacement.
flights %>% sample_n(size = 4, replace = TRUE)
## # A tibble: 4 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     8    22     1857           1705       112     2150           2020
## 2  2013     5    27     1958           1925        33     2152           2129
## 3  2013     9    10     1345           1345         0     1520           1520
## 4  2013    11    21     1058           1100        -2     1311           1304
## # … with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# Rows holding the 4 largest dep_time values. top_n() keeps ties, which is
# why the result printed below has 29 rows instead of 4.
flights %>% top_n(4,dep_time)
## # A tibble: 29 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013    10    30     2400           2359         1      327            337
##  2  2013    11    27     2400           2359         1      515            445
##  3  2013    12     5     2400           2359         1      427            440
##  4  2013    12     9     2400           2359         1      432            440
##  5  2013    12     9     2400           2250        70       59           2356
##  6  2013    12    13     2400           2359         1      432            440
##  7  2013    12    19     2400           2359         1      434            440
##  8  2013    12    29     2400           1700       420      302           2025
##  9  2013     2     7     2400           2359         1      432            436
## 10  2013     2     7     2400           2359         1      443            444
## # … with 19 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

1.2、选择列 select()

若要对选择的列进行处理用mutate函数,这里只能对列名进行处理.

# Pick two columns by name.
ans <- select(flights, dep_time, arr_time)
ans %>% head()
## # A tibble: 6 x 2
##   dep_time arr_time
##      <int>    <int>
## 1      517      830
## 2      533      850
## 3      542      923
## 4      544     1004
## 5      554      812
## 6      554      740
# Pick the contiguous range of columns from day through arr_delay.
ans <- select(flights, day:arr_delay)
ans %>% head()
## # A tibble: 6 x 7
##     day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay
##   <int>    <int>          <int>     <dbl>    <int>          <int>     <dbl>
## 1     1      517            515         2      830            819        11
## 2     1      533            529         4      850            830        20
## 3     1      542            540         2      923            850        33
## 4     1      544            545        -1     1004           1022       -18
## 5     1      554            600        -6      812            837       -25
## 6     1      554            558        -4      740            728        12
# Keep the integer columns. The extra function passed to select_if() is applied
# to the column NAMES only (here upper-casing them); use mutate() to change
# the column values themselves.
ans <- select_if(head(flights), is.integer, toupper)
ans
## # A tibble: 6 x 8
##    YEAR MONTH   DAY DEP_TIME SCHED_DEP_TIME ARR_TIME SCHED_ARR_TIME FLIGHT
##   <int> <int> <int>    <int>          <int>    <int>          <int>  <int>
## 1  2013     1     1      517            515      830            819   1545
## 2  2013     1     1      533            529      850            830   1714
## 3  2013     1     1      542            540      923            850   1141
## 4  2013     1     1      544            545     1004           1022    725
## 5  2013     1     1      554            600      812            837    461
## 6  2013     1     1      554            558      740            728   1696
# Keep the character columns.
ans <- select_if(head(flights), is.character)
ans
## # A tibble: 6 x 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N14228  EWR    IAH  
## 2 UA      N24211  LGA    IAH  
## 3 AA      N619AA  JFK    MIA  
## 4 B6      N804JB  JFK    BQN  
## 5 DL      N668DN  LGA    ATL  
## 6 UA      N39463  EWR    ORD

选择数值列并且以某个字符开始的列

# Numeric columns whose name starts with "s" (the match is case-insensitive).
iris %>%
  select_if(is.numeric) %>%
  select(starts_with("s")) %>%
  head()
##   Sepal.Length Sepal.Width
## 1          5.1         3.5
## 2          4.9         3.0
## 3          4.7         3.2
## 4          4.6         3.1
## 5          5.0         3.6
## 6          5.4         3.9

选择非数值列

# Select the non-numeric columns.
## negate() from purrr turns a predicate into its logical complement.
library(purrr)
head(select_if(iris, negate(is.numeric)))
##   Species
## 1  setosa
## 2  setosa
## 3  setosa
## 4  setosa
## 5  setosa
## 6  setosa
# Same selection, with the predicate written as a lambda formula.
iris %>% select_if(~!is.numeric(.x)) %>% head()
##   Species
## 1  setosa
## 2  setosa
## 3  setosa
## 4  setosa
## 5  setosa
## 6  setosa
# funs() is deprecated since dplyr 0.8.0; negate a where() selection instead.
iris %>% select(!where(is.numeric)) %>% head()
##   Species
## 1  setosa
## 2  setosa
## 3  setosa
## 4  setosa
## 5  setosa
## 6  setosa
# Keep columns that are factor OR numeric (all five columns of iris qualify).
# head() limits the printout to the first rows, like the other examples,
# instead of dumping all 150 rows.
iris %>% select_if(~ is.factor(.x) | is.numeric(.x)) %>% head()
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

行列同时选择——子集

# Subset rows and columns in a single pipeline.
flights %>%
  filter(origin == "JFK", month == 6L) %>%
  select(day:arr_delay)
## # A tibble: 9,472 x 7
##      day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay
##    <int>    <int>          <int>     <dbl>    <int>          <int>     <dbl>
##  1     1        2           2359         3      341            350        -9
##  2     1      538            545        -7      925            922         3
##  3     1      539            540        -1      832            840        -8
##  4     1      553            600        -7      700            711       -11
##  5     1      554            600        -6      851            908       -17
##  6     1      557            600        -3      934            942        -8
##  7     1      559            600        -1      856            930       -34
##  8     1      606            610        -4      847            906       -19
##  9     1      609            615        -6      759            808        -9
## 10     1      615            610         5      837            847       -10
## # … with 9,462 more rows

1.3、排序arrange()

# Sort by origin ascending, then by dest descending; wrapping a column in
# desc() reverses its order (similar to base order()).
ans <- flights %>% arrange(origin, desc(dest))
ans %>% head()
## # A tibble: 6 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     2      905            822        43     1313           1045
## 2  2013     1     3      848            850        -2     1149           1113
## 3  2013     1     4      901            850        11     1120           1113
## 4  2013     1     6      843            848        -5     1053           1111
## 5  2013     1     7      858            850         8     1105           1113
## 6  2013     1     8      847            850        -3     1116           1113
## # … with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

1.4、添加新变量mutate

在引用函数时,.funs 参数的格式为 funs(mean(.)),例如 transmute_if(df, is.integer, funs(as.character(.)))。注意:funs() 自 dplyr 0.8.0 起已弃用,也可以直接传入函数名或 ~ 公式,例如 transmute_if(df, is.integer, as.character)。

# Add a new variable (several can be added in one call).
head(mutate(flights, yanwu = arr_delay + dep_delay))
## # A tibble: 6 x 20
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     1      517            515         2      830            819
## 2  2013     1     1      533            529         4      850            830
## 3  2013     1     1      542            540         2      923            850
## 4  2013     1     1      544            545        -1     1004           1022
## 5  2013     1     1      554            600        -6      812            837
## 6  2013     1     1      554            558        -4      740            728
## # … with 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>, yanwu <dbl>
# transmute() keeps only the newly created column.
head(transmute(flights, yanwu = arr_delay + dep_delay))
## # A tibble: 6 x 1
##   yanwu
##   <dbl>
## 1    13
## 2    24
## 3    35
## 4   -19
## 5   -31
## 6     8

有多少航班完全没有延误

# Flights with no delay at all: both the departure and the arrival must be on
# time or early. Filtering on the sum (yanwu < 0) would also keep flights where
# one of the two was late but the other more than made up for it, and would
# drop flights that were exactly on time.
flights %>%
  mutate(yanwu = arr_delay + dep_delay) %>%
  filter(dep_delay <= 0, arr_delay <= 0) %>%
  head()
## # A tibble: 6 x 20
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     1      544            545        -1     1004           1022
## 2  2013     1     1      554            600        -6      812            837
## 3  2013     1     1      557            600        -3      709            723
## 4  2013     1     1      557            600        -3      838            846
## 5  2013     1     1      558            600        -2      849            851
## 6  2013     1     1      558            600        -2      853            856
## # … with 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>, yanwu <dbl>
# Convert every integer column to character. The function is passed directly
# because funs() is deprecated since dplyr 0.8.0.
flights %>%
  head() %>%
  transmute_if(is.integer, as.character)
## # A tibble: 6 x 8
##   year  month day   dep_time sched_dep_time arr_time sched_arr_time flight
##   <chr> <chr> <chr> <chr>    <chr>          <chr>    <chr>          <chr> 
## 1 2013  1     1     517      515            830      819            1545  
## 2 2013  1     1     533      529            850      830            1714  
## 3 2013  1     1     542      540            923      850            1141  
## 4 2013  1     1     544      545            1004     1022           725   
## 5 2013  1     1     554      600            812      837            461   
## 6 2013  1     1     554      558            740      728            1696
# Print flights again: it still has its original 19 columns, because the
# results of the calls above were never assigned back to it.
flights 
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# A column created earlier in the same call can be used right away.
# NOTE(review): arr_time appears to be a clock time (e.g. 830, 1004), not a
# duration, so dividing by arr_time / 60 does not give a per-hour rate;
# air_time is probably what was intended -- confirm.
transmute(flights, 
       gain = arr_delay + dep_delay, 
       gain_per_hour = gain / (arr_time / 60))
## # A tibble: 336,776 x 2
##     gain gain_per_hour
##    <dbl>         <dbl>
##  1    13         0.940
##  2    24         1.69 
##  3    35         2.28 
##  4   -19        -1.14 
##  5   -31        -2.29 
##  6     8         0.649
##  7    14         0.920
##  8   -17        -1.44 
##  9   -11        -0.788
## 10     6         0.478
## # … with 336,766 more rows

1.5 汇总(行): summarise()

对数据框调用其它函数进行汇总操作, 返回一维的结果:先用一个简单的数据集iris

# Mean sepal length per species. TRUE is spelled out because T is an ordinary
# variable that can be reassigned.
iris %>% group_by(Species) %>% summarise(mean(Sepal.Length, na.rm = TRUE))
## # A tibble: 3 x 2
##   Species    `mean(Sepal.Length, na.rm = TRUE)`
##   <fct>                                   <dbl>
## 1 setosa                                   5.01
## 2 versicolor                               5.94
## 3 virginica                                6.59
# Mean of every non-grouping column per species. summarise_each() and funs()
# are deprecated; across() applies a function to many columns at once.
iris %>% group_by(Species) %>% summarise(across(everything(), mean))
## # A tibble: 3 x 5
##   Species    Sepal.Length Sepal.Width Petal.Length Petal.Width
##   <fct>             <dbl>       <dbl>        <dbl>       <dbl>
## 1 setosa             5.01        3.43         1.46       0.246
## 2 versicolor         5.94        2.77         4.26       1.33 
## 3 virginica          6.59        2.97         5.55       2.03
# Min and max of every non-grouping column per species. summarise_each() and
# funs() are deprecated; summarise_all() with a named list produces the same
# <column>_<function> output names.
iris %>% group_by(Species) %>% summarise_all(list(min = min, max = max))
## # A tibble: 3 x 9
##   Species Sepal.Length_min Sepal.Width_min Petal.Length_min Petal.Width_min
##   <fct>              <dbl>           <dbl>            <dbl>           <dbl>
## 1 setosa               4.3             2.3              1               0.1
## 2 versic…              4.9             2                3               1  
## 3 virgin…              4.9             2.2              4.5             1.4
## # … with 4 more variables: Sepal.Length_max <dbl>, Sepal.Width_max <dbl>,
## #   Petal.Length_max <dbl>, Petal.Width_max <dbl>
# Number of rows per Sepal.Width value, most frequent first.
iris %>%
  group_by(Sepal.Width) %>%
  summarise(f_count = n()) %>%
  arrange(desc(f_count))
## # A tibble: 23 x 2
##    Sepal.Width f_count
##          <dbl>   <int>
##  1         3        26
##  2         2.8      14
##  3         3.2      13
##  4         3.4      12
##  5         3.1      11
##  6         2.9      10
##  7         2.7       9
##  8         2.5       8
##  9         3.3       6
## 10         3.5       6
## # … with 13 more rows
# The pipeline above is equivalent to: iris %>% group_by(Sepal.Width) %>% tally(sort = TRUE)
# After group_by(), tally() counts the rows of each group (like n()) in one
# step; sort = TRUE orders the result by descending count.
# NOTE: with the dplyr version used here (1.0.1) a second tally() does not sum
# the n column; it counts the rows of the previous result. Use tally(wt = n)
# to sum the counts instead.
iris %>% group_by(Sepal.Width) %>% tally(sort=TRUE)
## # A tibble: 23 x 2
##    Sepal.Width     n
##          <dbl> <int>
##  1         3      26
##  2         2.8    14
##  3         3.2    13
##  4         3.4    12
##  5         3.1    11
##  6         2.9    10
##  7         2.7     9
##  8         2.5     8
##  9         3.3     6
## 10         3.5     6
## # … with 13 more rows
# The second tally() counts the rows of the tallied table: the result below is
# 23 (the number of distinct Sepal.Width values), not the sum of the n column.
iris %>% group_by(Sepal.Width) %>% tally(sort=TRUE)%>%tally()
## # A tibble: 1 x 1
##       n
##   <int>
## 1    23

1.6、分组动作

以上5个函数已经很方便了, 但是当它们跟分组操作这个概念结合起来时, 那才叫真正的强大! 当对数据集通过 group_by() 添加了分组信息后, filter(), select(), mutate(), arrange() 和 summarise() 函数会自动对这些 tbl 类数据执行分组操作 (R语言泛型函数的优势).

例如: 对飞机航班数据按飞机编号 (tailnum) 进行分组, 计算该飞机航班的次数 (count = n()), 平均飞行距离 (dist = mean(distance, na.rm = TRUE)) 和 延时 (delay = mean(arr_delay, na.rm = TRUE))

# Per-aircraft (tailnum) summary: number of flights, mean distance and mean
# arrival delay. Uses `<-` for assignment and TRUE instead of T (T is an
# ordinary variable that can be reassigned).
ans <- flights %>%
  group_by(tailnum) %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  )
ans %>% head()
## # A tibble: 6 x 4
##   tailnum count  dist delay
##   <chr>   <int> <dbl> <dbl>
## 1 D942DN      4  854. 31.5 
## 2 N0EGMQ    371  676.  9.98
## 3 N10156    153  758. 12.7 
## 4 N102UW     48  536.  2.94
## 5 N103US     46  535. -6.93
## 6 N104UW     47  535.  1.80
# Aircraft with more than 20 flights and a mean distance below 2000.
filter(ans, count > 20, dist < 2000)
## # A tibble: 2,962 x 4
##    tailnum count  dist  delay
##    <chr>   <int> <dbl>  <dbl>
##  1 N0EGMQ    371  676.  9.98 
##  2 N10156    153  758. 12.7  
##  3 N102UW     48  536.  2.94 
##  4 N103US     46  535. -6.93 
##  5 N104UW     47  535.  1.80 
##  6 N10575    289  520. 20.7  
##  7 N105UW     45  525. -0.267
##  8 N107US     41  529. -5.73 
##  9 N108UW     60  534. -1.25 
## 10 N109UW     48  536. -2.52 
## # … with 2,952 more rows
# Plot with ggplot2: delay shows little correlation with flight distance.
library(ggplot2)
ggplot(ans, aes(x = dist, y = delay)) +
  geom_point(aes(size = count), alpha = 1 / 2) +
  geom_smooth() +
  scale_size_area()

sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Mojave 10.14.5
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] zh_CN.UTF-8/zh_CN.UTF-8/zh_CN.UTF-8/C/zh_CN.UTF-8/zh_CN.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggplot2_3.3.2      purrr_0.3.4        nycflights13_1.0.1 dplyr_1.0.1       
## 
## loaded via a namespace (and not attached):
##  [1] pillar_1.4.6     compiler_4.0.2   tools_4.0.2      digest_0.6.25   
##  [5] evaluate_0.14    lifecycle_0.2.0  tibble_3.0.3     gtable_0.3.0    
##  [9] nlme_3.1-148     lattice_0.20-41  mgcv_1.8-31      pkgconfig_2.0.3 
## [13] rlang_0.4.7      Matrix_1.2-18    cli_2.0.2        yaml_2.2.1      
## [17] blogdown_0.20    xfun_0.17        withr_2.2.0      stringr_1.4.0   
## [21] knitr_1.29       generics_0.0.2   vctrs_0.3.2      grid_4.0.2      
## [25] tidyselect_1.1.0 glue_1.4.1       R6_2.4.1         fansi_0.4.1     
## [29] rmarkdown_2.3    bookdown_0.20    farver_2.0.3     magrittr_1.5    
## [33] scales_1.1.1     ellipsis_0.3.1   htmltools_0.5.0  splines_4.0.2   
## [37] assertthat_0.2.1 colorspace_1.4-1 labeling_0.3     utf8_1.1.4      
## [41] stringi_1.4.6    munsell_0.5.0    crayon_1.3.4

次;