把前两天的两篇文章合并,解决方法:名字还是不能太长,在content目录下新建test目录,把它放在content目录下的test目录,不放在post目录,我的test目录只有两篇文章
1.1、选择行filter()
安装nycflights13
包,该软件包中的飞机航班数据将用于本文中dplyr
包各个函数的演示
library(dplyr)
library(nycflights13)
函数tbl_df()
将过长过大的数据集转换为显示更友好的 tbl_df 类型:
flights <- tbl_df(flights)
head(flights) #有336,776 x 19
## # A tibble: 6 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## # … with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights,origin == "JFK",month == 6L) #- 获取六月份所有从”JFK”机场起飞的航班
## # A tibble: 9,472 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 6 1 2 2359 3 341 350
## 2 2013 6 1 538 545 -7 925 922
## 3 2013 6 1 539 540 -1 832 840
## 4 2013 6 1 553 600 -7 700 711
## 5 2013 6 1 554 600 -6 851 908
## 6 2013 6 1 557 600 -3 934 942
## 7 2013 6 1 559 600 -1 856 930
## 8 2013 6 1 606 610 -4 847 906
## 9 2013 6 1 609 615 -6 759 808
## 10 2013 6 1 615 610 5 837 847
## # … with 9,462 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
slice(flights,1:2) #选取前面的1:2行
## # A tibble: 2 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## # … with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
sample_n(flights, 4, replace = TRUE)# 随机选取4条数据记录。
## # A tibble: 4 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 8 22 1857 1705 112 2150 2020
## 2 2013 5 27 1958 1925 33 2152 2129
## 3 2013 9 10 1345 1345 0 1520 1520
## 4 2013 11 21 1058 1100 -2 1311 1304
## # … with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
flights %>% top_n(4,dep_time)
## # A tibble: 29 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 10 30 2400 2359 1 327 337
## 2 2013 11 27 2400 2359 1 515 445
## 3 2013 12 5 2400 2359 1 427 440
## 4 2013 12 9 2400 2359 1 432 440
## 5 2013 12 9 2400 2250 70 59 2356
## 6 2013 12 13 2400 2359 1 432 440
## 7 2013 12 19 2400 2359 1 434 440
## 8 2013 12 29 2400 1700 420 302 2025
## 9 2013 2 7 2400 2359 1 432 436
## 10 2013 2 7 2400 2359 1 443 444
## # … with 19 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
1.2、选择列 select()
若要对选择的列进行处理用mutate函数,这里只能对列名进行处理.
ans <- flights %>% select(dep_time,arr_time)
head(ans)
## # A tibble: 6 x 2
## dep_time arr_time
## <int> <int>
## 1 517 830
## 2 533 850
## 3 542 923
## 4 544 1004
## 5 554 812
## 6 554 740
ans <- flights %>% select(day:arr_delay)
head(ans)
## # A tibble: 6 x 7
## day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay
## <int> <int> <int> <dbl> <int> <int> <dbl>
## 1 1 517 515 2 830 819 11
## 2 1 533 529 4 850 830 20
## 3 1 542 540 2 923 850 33
## 4 1 544 545 -1 1004 1022 -18
## 5 1 554 600 -6 812 837 -25
## 6 1 554 558 -4 740 728 12
#选择整数列,若要对选择的列进行处理用mutate函数,这里只能对列名进行处理,比如
ans <- flights %>% head() %>% select_if(is.integer,toupper)
ans
## # A tibble: 6 x 8
## YEAR MONTH DAY DEP_TIME SCHED_DEP_TIME ARR_TIME SCHED_ARR_TIME FLIGHT
## <int> <int> <int> <int> <int> <int> <int> <int>
## 1 2013 1 1 517 515 830 819 1545
## 2 2013 1 1 533 529 850 830 1714
## 3 2013 1 1 542 540 923 850 1141
## 4 2013 1 1 544 545 1004 1022 725
## 5 2013 1 1 554 600 812 837 461
## 6 2013 1 1 554 558 740 728 1696
#选择是字符串的列
ans <- flights %>% head() %>% select_if(is.character)
ans
## # A tibble: 6 x 4
## carrier tailnum origin dest
## <chr> <chr> <chr> <chr>
## 1 UA N14228 EWR IAH
## 2 UA N24211 LGA IAH
## 3 AA N619AA JFK MIA
## 4 B6 N804JB JFK BQN
## 5 DL N668DN LGA ATL
## 6 UA N39463 EWR ORD
选择数值列并且以某个字符开始的列
select_if(iris,is.numeric) %>% select(starts_with("s")) %>% head()
## Sepal.Length Sepal.Width
## 1 5.1 3.5
## 2 4.9 3.0
## 3 4.7 3.2
## 4 4.6 3.1
## 5 5.0 3.6
## 6 5.4 3.9
选择非数值列
# 选择非数值列
## 利用purrr包中的negate函数
library(purrr)
iris %>% select_if(negate(is.numeric)) %>% head()
## Species
## 1 setosa
## 2 setosa
## 3 setosa
## 4 setosa
## 5 setosa
## 6 setosa
iris %>% select_if(~!is.numeric(.x)) %>% head()
## Species
## 1 setosa
## 2 setosa
## 3 setosa
## 4 setosa
## 5 setosa
## 6 setosa
iris %>% select_if(funs(!is.numeric(.))) %>% head()
## Species
## 1 setosa
## 2 setosa
## 3 setosa
## 4 setosa
## 5 setosa
## 6 setosa
iris %>% select_if(~ is.factor(.x)|is.numeric(.x))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5.0 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## 11 5.4 3.7 1.5 0.2 setosa
## 12 4.8 3.4 1.6 0.2 setosa
## 13 4.8 3.0 1.4 0.1 setosa
## 14 4.3 3.0 1.1 0.1 setosa
## 15 5.8 4.0 1.2 0.2 setosa
## 16 5.7 4.4 1.5 0.4 setosa
## 17 5.4 3.9 1.3 0.4 setosa
## 18 5.1 3.5 1.4 0.3 setosa
## 19 5.7 3.8 1.7 0.3 setosa
## 20 5.1 3.8 1.5 0.3 setosa
## 21 5.4 3.4 1.7 0.2 setosa
## 22 5.1 3.7 1.5 0.4 setosa
## 23 4.6 3.6 1.0 0.2 setosa
## 24 5.1 3.3 1.7 0.5 setosa
## 25 4.8 3.4 1.9 0.2 setosa
## 26 5.0 3.0 1.6 0.2 setosa
## 27 5.0 3.4 1.6 0.4 setosa
## 28 5.2 3.5 1.5 0.2 setosa
## 29 5.2 3.4 1.4 0.2 setosa
## 30 4.7 3.2 1.6 0.2 setosa
## 31 4.8 3.1 1.6 0.2 setosa
## 32 5.4 3.4 1.5 0.4 setosa
## 33 5.2 4.1 1.5 0.1 setosa
## 34 5.5 4.2 1.4 0.2 setosa
## 35 4.9 3.1 1.5 0.2 setosa
## 36 5.0 3.2 1.2 0.2 setosa
## 37 5.5 3.5 1.3 0.2 setosa
## 38 4.9 3.6 1.4 0.1 setosa
## 39 4.4 3.0 1.3 0.2 setosa
## 40 5.1 3.4 1.5 0.2 setosa
## 41 5.0 3.5 1.3 0.3 setosa
## 42 4.5 2.3 1.3 0.3 setosa
## 43 4.4 3.2 1.3 0.2 setosa
## 44 5.0 3.5 1.6 0.6 setosa
## 45 5.1 3.8 1.9 0.4 setosa
## 46 4.8 3.0 1.4 0.3 setosa
## 47 5.1 3.8 1.6 0.2 setosa
## 48 4.6 3.2 1.4 0.2 setosa
## 49 5.3 3.7 1.5 0.2 setosa
## 50 5.0 3.3 1.4 0.2 setosa
## 51 7.0 3.2 4.7 1.4 versicolor
## 52 6.4 3.2 4.5 1.5 versicolor
## 53 6.9 3.1 4.9 1.5 versicolor
## 54 5.5 2.3 4.0 1.3 versicolor
## 55 6.5 2.8 4.6 1.5 versicolor
## 56 5.7 2.8 4.5 1.3 versicolor
## 57 6.3 3.3 4.7 1.6 versicolor
## 58 4.9 2.4 3.3 1.0 versicolor
## 59 6.6 2.9 4.6 1.3 versicolor
## 60 5.2 2.7 3.9 1.4 versicolor
## 61 5.0 2.0 3.5 1.0 versicolor
## 62 5.9 3.0 4.2 1.5 versicolor
## 63 6.0 2.2 4.0 1.0 versicolor
## 64 6.1 2.9 4.7 1.4 versicolor
## 65 5.6 2.9 3.6 1.3 versicolor
## 66 6.7 3.1 4.4 1.4 versicolor
## 67 5.6 3.0 4.5 1.5 versicolor
## 68 5.8 2.7 4.1 1.0 versicolor
## 69 6.2 2.2 4.5 1.5 versicolor
## 70 5.6 2.5 3.9 1.1 versicolor
## 71 5.9 3.2 4.8 1.8 versicolor
## 72 6.1 2.8 4.0 1.3 versicolor
## 73 6.3 2.5 4.9 1.5 versicolor
## 74 6.1 2.8 4.7 1.2 versicolor
## 75 6.4 2.9 4.3 1.3 versicolor
## 76 6.6 3.0 4.4 1.4 versicolor
## 77 6.8 2.8 4.8 1.4 versicolor
## 78 6.7 3.0 5.0 1.7 versicolor
## 79 6.0 2.9 4.5 1.5 versicolor
## 80 5.7 2.6 3.5 1.0 versicolor
## 81 5.5 2.4 3.8 1.1 versicolor
## 82 5.5 2.4 3.7 1.0 versicolor
## 83 5.8 2.7 3.9 1.2 versicolor
## 84 6.0 2.7 5.1 1.6 versicolor
## 85 5.4 3.0 4.5 1.5 versicolor
## 86 6.0 3.4 4.5 1.6 versicolor
## 87 6.7 3.1 4.7 1.5 versicolor
## 88 6.3 2.3 4.4 1.3 versicolor
## 89 5.6 3.0 4.1 1.3 versicolor
## 90 5.5 2.5 4.0 1.3 versicolor
## 91 5.5 2.6 4.4 1.2 versicolor
## 92 6.1 3.0 4.6 1.4 versicolor
## 93 5.8 2.6 4.0 1.2 versicolor
## 94 5.0 2.3 3.3 1.0 versicolor
## 95 5.6 2.7 4.2 1.3 versicolor
## 96 5.7 3.0 4.2 1.2 versicolor
## 97 5.7 2.9 4.2 1.3 versicolor
## 98 6.2 2.9 4.3 1.3 versicolor
## 99 5.1 2.5 3.0 1.1 versicolor
## 100 5.7 2.8 4.1 1.3 versicolor
## 101 6.3 3.3 6.0 2.5 virginica
## 102 5.8 2.7 5.1 1.9 virginica
## 103 7.1 3.0 5.9 2.1 virginica
## 104 6.3 2.9 5.6 1.8 virginica
## 105 6.5 3.0 5.8 2.2 virginica
## 106 7.6 3.0 6.6 2.1 virginica
## 107 4.9 2.5 4.5 1.7 virginica
## 108 7.3 2.9 6.3 1.8 virginica
## 109 6.7 2.5 5.8 1.8 virginica
## 110 7.2 3.6 6.1 2.5 virginica
## 111 6.5 3.2 5.1 2.0 virginica
## 112 6.4 2.7 5.3 1.9 virginica
## 113 6.8 3.0 5.5 2.1 virginica
## 114 5.7 2.5 5.0 2.0 virginica
## 115 5.8 2.8 5.1 2.4 virginica
## 116 6.4 3.2 5.3 2.3 virginica
## 117 6.5 3.0 5.5 1.8 virginica
## 118 7.7 3.8 6.7 2.2 virginica
## 119 7.7 2.6 6.9 2.3 virginica
## 120 6.0 2.2 5.0 1.5 virginica
## 121 6.9 3.2 5.7 2.3 virginica
## 122 5.6 2.8 4.9 2.0 virginica
## 123 7.7 2.8 6.7 2.0 virginica
## 124 6.3 2.7 4.9 1.8 virginica
## 125 6.7 3.3 5.7 2.1 virginica
## 126 7.2 3.2 6.0 1.8 virginica
## 127 6.2 2.8 4.8 1.8 virginica
## 128 6.1 3.0 4.9 1.8 virginica
## 129 6.4 2.8 5.6 2.1 virginica
## 130 7.2 3.0 5.8 1.6 virginica
## 131 7.4 2.8 6.1 1.9 virginica
## 132 7.9 3.8 6.4 2.0 virginica
## 133 6.4 2.8 5.6 2.2 virginica
## 134 6.3 2.8 5.1 1.5 virginica
## 135 6.1 2.6 5.6 1.4 virginica
## 136 7.7 3.0 6.1 2.3 virginica
## 137 6.3 3.4 5.6 2.4 virginica
## 138 6.4 3.1 5.5 1.8 virginica
## 139 6.0 3.0 4.8 1.8 virginica
## 140 6.9 3.1 5.4 2.1 virginica
## 141 6.7 3.1 5.6 2.4 virginica
## 142 6.9 3.1 5.1 2.3 virginica
## 143 5.8 2.7 5.1 1.9 virginica
## 144 6.8 3.2 5.9 2.3 virginica
## 145 6.7 3.3 5.7 2.5 virginica
## 146 6.7 3.0 5.2 2.3 virginica
## 147 6.3 2.5 5.0 1.9 virginica
## 148 6.5 3.0 5.2 2.0 virginica
## 149 6.2 3.4 5.4 2.3 virginica
## 150 5.9 3.0 5.1 1.8 virginica
行同时选择——子集
filter(flights,origin == "JFK",month == 6L) %>%select(day:arr_delay)
## # A tibble: 9,472 x 7
## day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay
## <int> <int> <int> <dbl> <int> <int> <dbl>
## 1 1 2 2359 3 341 350 -9
## 2 1 538 545 -7 925 922 3
## 3 1 539 540 -1 832 840 -8
## 4 1 553 600 -7 700 711 -11
## 5 1 554 600 -6 851 908 -17
## 6 1 557 600 -3 934 942 -8
## 7 1 559 600 -1 856 930 -34
## 8 1 606 610 -4 847 906 -19
## 9 1 609 615 -6 759 808 -9
## 10 1 615 610 5 837 847 -10
## # … with 9,462 more rows
1.3、排序arrange()
ans <- arrange(flights,origin,desc(dest)) #对列名加 desc() 进行倒序: 与基本函数order()类似
head(ans)
## # A tibble: 6 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 2 905 822 43 1313 1045
## 2 2013 1 3 848 850 -2 1149 1113
## 3 2013 1 4 901 850 11 1120 1113
## 4 2013 1 6 843 848 -5 1053 1111
## 5 2013 1 7 858 850 8 1105 1113
## 6 2013 1 8 847 850 -3 1116 1113
## # … with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
1.4、添加新变量mutate
在引用函数是 .funs()
的格式为funs(mean(.))
e.g.transmute_if(df,is.integer,funs(as.character(.)))
# 添加新变量(可以多列)
flights %>% mutate(yanwu=arr_delay + dep_delay) %>% head()
## # A tibble: 6 x 20
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## # … with 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>, yanwu <dbl>
flights %>% transmute(yanwu=arr_delay + dep_delay) %>% head()
## # A tibble: 6 x 1
## yanwu
## <dbl>
## 1 13
## 2 24
## 3 35
## 4 -19
## 5 -31
## 6 8
有多少航班完全没有延误
#有多少航班完全没有延误
flights %>% mutate(yanwu=arr_delay + dep_delay) %>% filter(yanwu<0) %>% head()
## # A tibble: 6 x 20
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 544 545 -1 1004 1022
## 2 2013 1 1 554 600 -6 812 837
## 3 2013 1 1 557 600 -3 709 723
## 4 2013 1 1 557 600 -3 838 846
## 5 2013 1 1 558 600 -2 849 851
## 6 2013 1 1 558 600 -2 853 856
## # … with 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>, yanwu <dbl>
flights %>%head() %>% transmute_if(is.integer,funs(as.character(.))) #把整数类型的列转换为字符串
## # A tibble: 6 x 8
## year month day dep_time sched_dep_time arr_time sched_arr_time flight
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2013 1 1 517 515 830 819 1545
## 2 2013 1 1 533 529 850 830 1714
## 3 2013 1 1 542 540 923 850 1141
## 4 2013 1 1 544 545 1004 1022 725
## 5 2013 1 1 554 600 812 837 461
## 6 2013 1 1 554 558 740 728 1696
flights
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# 可以在同一语句中对刚新增加的列进行操作:
transmute(flights,
gain = arr_delay + dep_delay,
gain_per_hour = gain / (arr_time / 60))
## # A tibble: 336,776 x 2
## gain gain_per_hour
## <dbl> <dbl>
## 1 13 0.940
## 2 24 1.69
## 3 35 2.28
## 4 -19 -1.14
## 5 -31 -2.29
## 6 8 0.649
## 7 14 0.920
## 8 -17 -1.44
## 9 -11 -0.788
## 10 6 0.478
## # … with 336,766 more rows
1.5 汇总(行): summarise()
对数据框调用其它函数进行汇总操作, 返回一维的结果:先用一个简单的数据集iris
iris %>% group_by(Species) %>% summarise(mean(Sepal.Length,na.rm = T))
## # A tibble: 3 x 2
## Species `mean(Sepal.Length, na.rm = T)`
## <fct> <dbl>
## 1 setosa 5.01
## 2 versicolor 5.94
## 3 virginica 6.59
iris %>% group_by(Species) %>% summarise_each(funs(mean))
## # A tibble: 3 x 5
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 setosa 5.01 3.43 1.46 0.246
## 2 versicolor 5.94 2.77 4.26 1.33
## 3 virginica 6.59 2.97 5.55 2.03
iris %>% group_by(Species) %>% summarise_each(funs(min(.),max(.)))
## # A tibble: 3 x 9
## Species Sepal.Length_min Sepal.Width_min Petal.Length_min Petal.Width_min
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 setosa 4.3 2.3 1 0.1
## 2 versic… 4.9 2 3 1
## 3 virgin… 4.9 2.2 4.5 1.4
## # … with 4 more variables: Sepal.Length_max <dbl>, Sepal.Width_max <dbl>,
## # Petal.Length_max <dbl>, Petal.Width_max <dbl>
iris %>% group_by(Sepal.Width) %>% summarise(f_count=n()) %>% arrange(desc(f_count))
## # A tibble: 23 x 2
## Sepal.Width f_count
## <dbl> <int>
## 1 3 26
## 2 2.8 14
## 3 3.2 13
## 4 3.4 12
## 5 3.1 11
## 6 2.9 10
## 7 2.7 9
## 8 2.5 8
## 9 3.3 6
## 10 3.5 6
## # … with 13 more rows
#上述等价iris %>% group_by(Sepal.Width) %>% tally(sort=TRUE)
#tally可以一步完成上述工作,group_by()以后第一次使用tally进行n()操作,再一次就是sum(n) sort=TRUE对结果排序,当等于TRUE是降序
#再次使用tally()就是sum()
iris %>% group_by(Sepal.Width) %>% tally(sort=TRUE)
## # A tibble: 23 x 2
## Sepal.Width n
## <dbl> <int>
## 1 3 26
## 2 2.8 14
## 3 3.2 13
## 4 3.4 12
## 5 3.1 11
## 6 2.9 10
## 7 2.7 9
## 8 2.5 8
## 9 3.3 6
## 10 3.5 6
## # … with 13 more rows
iris %>% group_by(Sepal.Width) %>% tally(sort=TRUE)%>%tally()
## # A tibble: 1 x 1
## n
## <int>
## 1 23
1.6、分组动作
以上5个函数已经很方便了, 但是当它们跟分组操作这个概念结合起来时, 那才叫真正的强大! 当对数据集通过 group_by()
添加了分组信息后,filter()
,select()
,mutate()
, arrange()
和 summarise()
函数会自动对这些 tbl 类数据执行分组操作 (R语言泛型函数的优势).
例如: 对飞机航班数据按飞机编号 (tailnum) 进行分组, 计算该飞机航班的次数 (count = n()), 平均飞行距离 (dist = mean(distance, na.rm = TRUE)) 和 延时 (delay = mean(arr_delay, na.rm = TRUE))
ans=flights %>%
group_by(tailnum) %>%
summarise(count=n(),dist=mean(distance,na.rm = T),delay=mean(arr_delay,na.rm = T))
ans %>% head()
## # A tibble: 6 x 4
## tailnum count dist delay
## <chr> <int> <dbl> <dbl>
## 1 D942DN 4 854. 31.5
## 2 N0EGMQ 371 676. 9.98
## 3 N10156 153 758. 12.7
## 4 N102UW 48 536. 2.94
## 5 N103US 46 535. -6.93
## 6 N104UW 47 535. 1.80
ans %>% filter(count>20,dist<2000)
## # A tibble: 2,962 x 4
## tailnum count dist delay
## <chr> <int> <dbl> <dbl>
## 1 N0EGMQ 371 676. 9.98
## 2 N10156 153 758. 12.7
## 3 N102UW 48 536. 2.94
## 4 N103US 46 535. -6.93
## 5 N104UW 47 535. 1.80
## 6 N10575 289 520. 20.7
## 7 N105UW 45 525. -0.267
## 8 N107US 41 529. -5.73
## 9 N108UW 60 534. -1.25
## 10 N109UW 48 536. -2.52
## # … with 2,952 more rows
#用 ggplot2 包作个图观察一下, 发现飞机延时不延时跟飞行距离没太大相关性:
library(ggplot2)
ggplot(ans, aes(dist, delay)) + geom_point(aes(size = count), alpha = 1/2) + geom_smooth() + scale_size_area()
sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Mojave 10.14.5
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] zh_CN.UTF-8/zh_CN.UTF-8/zh_CN.UTF-8/C/zh_CN.UTF-8/zh_CN.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggplot2_3.3.2 purrr_0.3.4 nycflights13_1.0.1 dplyr_1.0.1
##
## loaded via a namespace (and not attached):
## [1] pillar_1.4.6 compiler_4.0.2 tools_4.0.2 digest_0.6.25
## [5] evaluate_0.14 lifecycle_0.2.0 tibble_3.0.3 gtable_0.3.0
## [9] nlme_3.1-148 lattice_0.20-41 mgcv_1.8-31 pkgconfig_2.0.3
## [13] rlang_0.4.7 Matrix_1.2-18 cli_2.0.2 yaml_2.2.1
## [17] blogdown_0.20 xfun_0.17 withr_2.2.0 stringr_1.4.0
## [21] knitr_1.29 generics_0.0.2 vctrs_0.3.2 grid_4.0.2
## [25] tidyselect_1.1.0 glue_1.4.1 R6_2.4.1 fansi_0.4.1
## [29] rmarkdown_2.3 bookdown_0.20 farver_2.0.3 magrittr_1.5
## [33] scales_1.1.1 ellipsis_0.3.1 htmltools_0.5.0 splines_4.0.2
## [37] assertthat_0.2.1 colorspace_1.4-1 labeling_0.3 utf8_1.1.4
## [41] stringi_1.4.6 munsell_0.5.0 crayon_1.3.4